[GPU-Plugin] Multi-GPU gpu_id bug fixes for grow_gpu_hist and grow_gpu methods, and additional documentation for the gpu plugin. (#2463)

parent 91dae84a00
commit 6b287177c8
.gitignore (vendored)

@@ -15,7 +15,7 @@
 *.Rcheck
 *.rds
 *.tar.gz
-*txt*
+#*txt*
 *conf
 *buffer
 *model

@@ -3,11 +3,8 @@ project (xgboost)
 find_package(OpenMP)
 
 option(PLUGIN_UPDATER_GPU "Build GPU accelerated tree construction plugin")
-set(GPU_COMPUTE_VER 35;50;52;60;61 CACHE STRING
-  "Space separated list of compute versions to be built against")
 if(PLUGIN_UPDATER_GPU)
   cmake_minimum_required (VERSION 3.5)
-  find_package(CUDA REQUIRED)
 endif()
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
@@ -83,6 +80,14 @@ set(RABIT_SOURCES
   rabit/src/c_api.cc
 )
 
+set(NCCL_SOURCES
+  nccl/src/*.cu
+)
+set(UPDATER_GPU_SOURCES
+  plugin/updater_gpu/src/*.cu
+  plugin/updater_gpu/src/exact/*.cu
+)
+
 add_subdirectory(dmlc-core)
 
 add_library(rabit STATIC ${RABIT_SOURCES})
@@ -102,35 +107,44 @@ endif()
 set(LINK_LIBRARIES dmlccore rabit)
 
 if(PLUGIN_UPDATER_GPU)
+  find_package(CUDA REQUIRED)
   # nccl
   set(LINK_LIBRARIES ${LINK_LIBRARIES} nccl)
   add_subdirectory(nccl)
   set(NCCL_DIRECTORY ${PROJECT_SOURCE_DIR}/nccl)
   include_directories(${NCCL_DIRECTORY}/src)
-  set(LINK_LIBRARIES ${LINK_LIBRARIES} ${CUDA_LIBRARIES})
   #Find cub
   set(CUB_DIRECTORY ${PROJECT_SOURCE_DIR}/cub/)
   include_directories(${CUB_DIRECTORY})
   #Find googletest
   set(GTEST_DIRECTORY "${CACHE_PREFIX}" CACHE PATH "Googletest directory")
   include_directories(${GTEST_DIRECTORY}/include)
   #gencode flags
+  set(GPU_COMPUTE_VER 35;50;52;60;61 CACHE STRING
+    "Space separated list of compute versions to be built against")
   set(GENCODE_FLAGS "")
   foreach(ver ${GPU_COMPUTE_VER})
     set(GENCODE_FLAGS "${GENCODE_FLAGS}-gencode arch=compute_${ver},code=sm_${ver};")
   endforeach()
   set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--expt-extended-lambda;${GENCODE_FLAGS};-lineinfo;")
   if(NOT MSVC)
     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC")
   endif()
   set(CUDA_SOURCES
     plugin/updater_gpu/src/updater_gpu.cu
     plugin/updater_gpu/src/gpu_hist_builder.cu
   )
   # use below for forcing specific arch
   cuda_compile(CUDA_OBJS ${CUDA_SOURCES} ${CUDA_NVCC_FLAGS})
 else()
   set(CUDA_OBJS "")
 endif()
 
 add_library(objxgboost OBJECT ${SOURCES})

Makefile

@@ -96,7 +96,7 @@ endif
 CFLAGS += $(OPENMP_FLAGS)
 
 # for using GPUs
-GPU_COMPUTE_VER ?= 50 52 60 61
+GPU_COMPUTE_VER ?= 35 50 52 60 61
 NVCC = nvcc
 INCLUDES = -Iinclude -I$(DMLC_CORE)/include -I$(RABIT)/include
 INCLUDES += -I$(CUB_PATH)
@@ -106,14 +106,13 @@ NVCC_FLAGS = --std=c++11 $(CODE) $(INCLUDES) -lineinfo --expt-extended-lambda
 NVCC_FLAGS += -Xcompiler=$(OPENMP_FLAGS) -Xcompiler=-fPIC
 ifeq ($(PLUGIN_UPDATER_GPU),ON)
 	CUDA_ROOT = $(shell dirname $(shell dirname $(shell which $(NVCC))))
-	INCLUDES += -I$(CUDA_ROOT)/include
-	LDFLAGS += -L$(CUDA_ROOT)/lib64 -lcudart
+	INCLUDES += -I$(CUDA_ROOT)/include -Inccl/src/
+	LDFLAGS += -L$(CUDA_ROOT)/lib64 -lcudart -lcudadevrt -Lnccl/build/lib/ -lnccl_static -lm -ldl -lrt
 endif
 
 # specify tensor path
 .PHONY: clean all lint clean_all doxygen rcpplint pypack Rpack Rbuild Rcheck java pylint
 
 all: lib/libxgboost.a $(XGBOOST_DYLIB) xgboost
 
 $(DMLC_CORE)/libdmlc.a: $(wildcard $(DMLC_CORE)/src/*.cc $(DMLC_CORE)/src/*/*.cc)
@@ -143,7 +142,7 @@ build/%.o: src/%.cc
 	$(CXX) -c $(CFLAGS) $< -o $@
 
 # order of this rule matters wrt %.cc rule below!
-build_plugin/%.o: plugin/%.cu
+build_plugin/%.o: plugin/%.cu build_nccl
 	@mkdir -p $(@D)
 	$(NVCC) -c $(NVCC_FLAGS) $< -o $@
 
@@ -152,6 +151,11 @@ build_plugin/%.o: plugin/%.cc
 	$(CXX) $(CFLAGS) -MM -MT build_plugin/$*.o $< >build_plugin/$*.d
 	$(CXX) -c $(CFLAGS) $< -o $@
 
+build_nccl:
+	@mkdir -p build/include
+	cd build/include ; ln -sf ../../nccl/src/nccl.h .
+	cd nccl ; make -j ; cd ..
+
 # The should be equivalent to $(ALL_OBJ) except for build/cli_main.o
 amalgamation/xgboost-all0.o: amalgamation/xgboost-all0.cc
 	$(CXX) -c $(CFLAGS) $< -o $@
@@ -173,6 +177,7 @@ jvm-packages/lib/libxgboost4j.so: jvm-packages/xgboost4j/src/native/xgboost4j.cp
 	@mkdir -p $(@D)
 	$(CXX) $(CFLAGS) $(JAVAINCFLAGS) -shared -o $@ $(filter %.cpp %.o %.a, $^) $(LDFLAGS)
 
 xgboost: $(CLI_OBJ) $(ALL_DEP)
 	$(CXX) $(CFLAGS) -o $@ $(filter %.o %.a, $^) $(LDFLAGS)

cub (submodule)

@@ -1 +1 @@
-Subproject commit 89de7ab20167909bc2c4f8acd397671c47cf3c0d
+Subproject commit f3937a96fdec78a73446aaaa114c112ff31f5503

nccl (submodule)

@@ -1 +1 @@
-Subproject commit 93183bca921b2e8e1754e27e1b43d73cf6caec9d
+Subproject commit 8ec6c27a33a900fb92f7e39acc73cc0f43e8539b

@@ -63,6 +63,22 @@ submodule: The plugin also depends on CUB 1.6.4 - https://nvlabs.github.io/cub/
 
 submodule: NVIDIA NCCL from https://github.com/NVIDIA/nccl with windows port allowed by git@github.com:h2oai/nccl.git
 
+## Download full repo + full submodules for your choice (or empty) path <mypath>
+
+git clone --recursive https://github.com/dmlc/xgboost.git <mypath>
+
+## Download with shallow submodules for a much quicker download:
+
+git 2.9.0+ (assumes only HEAD is used for all submodules, which is currently not true for dmlc-core and rabit):
+
+git clone --recursive --shallow-submodules https://github.com/dmlc/xgboost.git <mypath>
+
+git before 2.9.0 (only cub, the largest repo, is cloned shallow):
+
+git clone https://github.com/dmlc/xgboost.git <mypath>
+cd <mypath>
+bash plugin/updater_gpu/gitshallow_submodules.sh
+
 ## Build
 
 From the command line on Linux starting from the xgboost directory:
@@ -84,12 +100,18 @@ $ mkdir build
 $ cd build
 $ cmake .. -G"Visual Studio 14 2015 Win64" -DPLUGIN_UPDATER_GPU=ON
 ```
-Cmake will generate an xgboost.sln solution file in the build directory. Build this solution in release mode as a x64 build.
+Cmake will create an xgboost.sln solution file in the build directory. Build this solution in release mode as a x64 build.
 
 Visual studio community 2015, supported by cuda toolkit (http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/#axzz4isREr2nS), can be downloaded from: https://my.visualstudio.com/Downloads?q=Visual%20Studio%20Community%202015 . You may also be able to use a later version of visual studio depending on whether the CUDA toolkit supports it. Note that Mingw cannot be used with cuda.
 
+### For other nccl libraries
+
+On some systems the nccl library is system specific (IBM Power or nvidia-docker) and can enable use of nvlink (between GPUs, or even between GPUs and system memory). In that case, avoid the static nccl library by changing "STATIC" to "SHARED" in nccl/CMakeLists.txt and delete the locally built shared nccl library, so that the system one is used instead.
+
 ### For Developers!
 
 In case you want to build only for a specific GPU(s), for eg. GP100 and GP102,
 whose compute capability are 60 and 61 respectively:
 ```bash
@@ -101,12 +123,12 @@ By default, the versions will include support for all GPUs in Maxwell and Pascal
 Now, it also supports the usual 'make' flow to build gpu-enabled tree construction plugins. It's currently only tested on Linux. From the xgboost directory
 ```bash
 # make sure CUDA SDK bin directory is in the 'PATH' env variable
-$ make PLUGIN_UPDATER_GPU=ON
+$ make -j PLUGIN_UPDATER_GPU=ON
 ```
 
 Similar to cmake, if you want to build only for a specific GPU(s):
 ```bash
-$ make PLUGIN_UPDATER_GPU=ON GPU_COMPUTE_VER="60 61"
+$ make -j PLUGIN_UPDATER_GPU=ON GPU_COMPUTE_VER="60 61"
 ```
 
 ### For Developers!

@@ -16,6 +16,8 @@ def run_benchmark(args, gpu_algorithm, cpu_algorithm):
     param = {'objective': 'binary:logistic',
              'max_depth': 6,
              'silent': 1,
+             'n_gpus': 1,
+             'gpu_id': 0,
              'eval_metric': 'auc'}
 
     param['tree_method'] = gpu_algorithm
@@ -41,9 +43,9 @@ args = parser.parse_args()
 
 if 'gpu_hist' in args.algorithm:
     run_benchmark(args, args.algorithm, 'hist')
-if 'gpu_exact' in args.algorithm:
+elif 'gpu_exact' in args.algorithm:
     run_benchmark(args, args.algorithm, 'exact')
-if 'all' in args.algorithm:
+elif 'all' in args.algorithm:
     run_benchmark(args, 'gpu_exact', 'exact')
     run_benchmark(args, 'gpu_hist', 'hist')

plugin/updater_gpu/gitshallow_submodules.sh (new file)

@@ -0,0 +1,12 @@
+#!/bin/bash
+git submodule init
+for i in $(git submodule | awk '{print $2}'); do
+    spath=$(git config -f .gitmodules --get submodule.$i.path)
+    surl=$(git config -f .gitmodules --get submodule.$i.url)
+    if [ $spath == "cub" ]
+    then
+        git submodule update --depth 3 $spath
+    else
+        git submodule update $spath
+    fi
+done

@@ -2,16 +2,16 @@
  * Copyright 2017 XGBoost contributors
  */
 #pragma once
+#include <cstdio>
+#include <stdexcept>
+#include <string>
 #include <vector>
 #include "../../../src/common/random.h"
 #include "../../../src/tree/param.h"
-#include "device_helpers.cuh"
-#include "types.cuh"
-#include <string>
-#include <stdexcept>
-#include <cstdio>
 #include "cub/cub.cuh"
 #include "device_helpers.cuh"
+#include "types.cuh"
 
 namespace xgboost {
 namespace tree {
@@ -172,8 +172,8 @@ inline void subsample_gpair(dh::dvec<gpu_gpair>* p_gpair, float subsample) {
 }
 
 inline std::vector<int> col_sample(std::vector<int> features, float colsample) {
-  int n = colsample * features.size();
-  CHECK_GT(n, 0);
+  CHECK_GT(features.size(), 0);
+  int n = std::max(1,static_cast<int>(colsample * features.size()));
 
   std::shuffle(features.begin(), features.end(), common::GlobalRandom());
   features.resize(n);
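Note on the col_sample fix above: the old code truncated `colsample * features.size()` to an int, so a small feature set with an aggressive `colsample` could yield `n == 0` and trip `CHECK_GT(n, 0)`. The new code clamps the count to at least one feature. A minimal host-only sketch of the same sizing rule (values are illustrative, not from the plugin):

```cpp
#include <algorithm>
#include <cassert>

// Mirrors the fixed col_sample() sizing: always keep at least one feature.
int sampled_count(int n_features, float colsample) {
  assert(n_features > 0);  // corresponds to CHECK_GT(features.size(), 0)
  return std::max(1, static_cast<int>(colsample * n_features));
}

int main() {
  // Old behaviour: static_cast<int>(0.1f * 5) == 0, which failed CHECK_GT(n, 0).
  // New behaviour clamps to 1, so column sampling still selects one feature.
  return sampled_count(5, 0.1f) == 1 ? 0 : 1;
}
```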
@@ -202,17 +202,18 @@ struct GpairCallbackOp {
 * @param offsets the segments
 */
 template <typename T1, typename T2>
-void segmentedSort(dh::CubMemory &tmp_mem, dh::dvec2<T1> &keys, dh::dvec2<T2> &vals,
-                   int nVals, int nSegs, dh::dvec<int> &offsets, int start=0,
-                   int end=sizeof(T1)*8) {
+void segmentedSort(dh::CubMemory& tmp_mem, dh::dvec2<T1>& keys,
+                   dh::dvec2<T2>& vals, int nVals, int nSegs,
+                   dh::dvec<int>& offsets, int start = 0,
+                   int end = sizeof(T1) * 8) {
   size_t tmpSize;
   dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
-      NULL, tmpSize, keys.buff(), vals.buff(), nVals, nSegs,
-      offsets.data(), offsets.data()+1, start, end));
+      NULL, tmpSize, keys.buff(), vals.buff(), nVals, nSegs, offsets.data(),
+      offsets.data() + 1, start, end));
   tmp_mem.LazyAllocate(tmpSize);
   dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
-      tmp_mem.d_temp_storage, tmpSize, keys.buff(), vals.buff(),
-      nVals, nSegs, offsets.data(), offsets.data()+1, start, end));
+      tmp_mem.d_temp_storage, tmpSize, keys.buff(), vals.buff(), nVals, nSegs,
+      offsets.data(), offsets.data() + 1, start, end));
 }
 
 /**
@@ -223,11 +224,11 @@ void segmentedSort(dh::CubMemory &tmp_mem, dh::dvec2<T1> &keys, dh::dvec2<T2> &vals,
 * @param nVals number of elements in the input array
 */
 template <typename T>
-void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
+void sumReduction(dh::CubMemory& tmp_mem, dh::dvec<T>& in, dh::dvec<T>& out,
                   int nVals) {
   size_t tmpSize;
-  dh::safe_cuda(cub::DeviceReduce::Sum(NULL, tmpSize, in.data(), out.data(),
-                                       nVals));
+  dh::safe_cuda(
+      cub::DeviceReduce::Sum(NULL, tmpSize, in.data(), out.data(), nVals));
   tmp_mem.LazyAllocate(tmpSize);
   dh::safe_cuda(cub::DeviceReduce::Sum(tmp_mem.d_temp_storage, tmpSize,
                                        in.data(), out.data(), nVals));
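Both helpers above use CUB's standard two-pass calling convention: the first call passes a null temporary-storage pointer so CUB only reports the workspace size (here handed to `tmp_mem.LazyAllocate`), and the second call with real storage does the work. A self-contained CUDA sketch of that pattern with `cub::DeviceReduce::Sum` (array contents are illustrative):

```cpp
#include <cub/cub.cuh>
#include <cstdio>

int main() {
  const int n = 4;
  float h_in[n] = {1.f, 2.f, 3.f, 4.f};
  float *d_in, *d_out;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

  // Pass 1: null temp storage, CUB only writes the required byte count.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);
  cudaMalloc(&d_temp, temp_bytes);

  // Pass 2: same call with real temp storage performs the reduction.
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);

  float h_out = 0.f;
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %f\n", h_out);  // expected 10.0

  cudaFree(d_temp);
  cudaFree(d_out);
  cudaFree(d_in);
  return 0;
}
```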
@@ -239,9 +240,10 @@ void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
 * @param len number of elements i the buffer
 * @param def default value to be filled
 */
-template <typename T, int BlkDim=256, int ItemsPerThread=4>
+template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
 void fillConst(int device_idx, T* out, int len, T def) {
-  dh::launch_n<ItemsPerThread,BlkDim>(device_idx, len, [=] __device__(int i) { out[i] = def; });
+  dh::launch_n<ItemsPerThread, BlkDim>(device_idx, len,
+                                       [=] __device__(int i) { out[i] = def; });
 }
 
 /**
@@ -253,17 +255,17 @@ void fillConst(int device_idx, T* out, int len, T def) {
 * @param instId gather indices
 * @param nVals length of the buffers
 */
-template <typename T1, typename T2, int BlkDim=256, int ItemsPerThread=4>
-void gather(int device_idx, T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId,
-            int nVals) {
-  dh::launch_n<ItemsPerThread,BlkDim>
-      (device_idx, nVals, [=] __device__(int i) {
+template <typename T1, typename T2, int BlkDim = 256, int ItemsPerThread = 4>
+void gather(int device_idx, T1* out1, const T1* in1, T2* out2, const T2* in2,
+            const int* instId, int nVals) {
+  dh::launch_n<ItemsPerThread, BlkDim>(device_idx, nVals,
+                                       [=] __device__(int i) {
     int iid = instId[i];
     T1 v1 = in1[iid];
     T2 v2 = in2[iid];
    out1[i] = v1;
    out2[i] = v2;
   });
 }
 
 /**
@@ -273,13 +275,13 @@ void gather(int device_idx, T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId,
 * @param instId gather indices
 * @param nVals length of the buffers
 */
-template <typename T, int BlkDim=256, int ItemsPerThread=4>
+template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
 void gather(int device_idx, T* out, const T* in, const int* instId, int nVals) {
-  dh::launch_n<ItemsPerThread,BlkDim>
-      (device_idx, nVals, [=] __device__(int i) {
+  dh::launch_n<ItemsPerThread, BlkDim>(device_idx, nVals,
+                                       [=] __device__(int i) {
     int iid = instId[i];
     out[i] = in[iid];
   });
 }
 
 } // namespace tree
@@ -9,11 +9,11 @@
 #include <algorithm>
 #include <chrono>
 #include <ctime>
+#include <cub/cub.cuh>
+#include <numeric>
 #include <sstream>
 #include <string>
 #include <vector>
-#include <numeric>
-#include <cub/cub.cuh>
 
 #ifndef NCCL
 #define NCCL 1
@@ -29,8 +29,8 @@
 
 namespace dh {
 
 #define HOST_DEV_INLINE __host__ __device__ __forceinline__
 #define DEV_INLINE __device__ __forceinline__
 
 /*
 * Error handling functions
@@ -126,6 +126,11 @@ inline std::string device_name(int device_idx) {
   return std::string(prop.name);
 }
 
+// ensure gpu_id is correct, so not dependent upon user knowing details
+inline int get_device_idx(int gpu_id) {
+  // protect against overrun for gpu_id
+  return (std::abs(gpu_id) + 0) % dh::n_visible_devices();
+}
+
 /*
 * Timers
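The `get_device_idx` helper added above is the heart of the multi-GPU gpu_id fix: call sites that previously passed `param.gpu_id` straight to CUDA now go through this wrapper, which folds any user-supplied id back into the range of visible devices. A host-side sketch of the same wrapping (the device count here is illustrative; in the plugin it comes from `dh::n_visible_devices()`):

```cpp
#include <cmath>
#include <cstdio>

// Same wrapping rule as dh::get_device_idx: map any gpu_id onto a valid
// device ordinal so an out-of-range or negative id cannot overrun the
// visible devices.
inline int wrap_gpu_id(int gpu_id, int n_visible) {
  return std::abs(gpu_id) % n_visible;
}

int main() {
  const int n_visible = 2;  // illustrative: a machine with two GPUs
  const int ids[] = {0, 1, 2, 3, -1};
  for (int gpu_id : ids) {
    // gpu_id 2 and 3 wrap back to devices 0 and 1; -1 maps to device 1.
    printf("gpu_id %d -> device %d\n", gpu_id, wrap_gpu_id(gpu_id, n_visible));
  }
  return 0;
}
```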
@@ -309,11 +314,13 @@ enum memory_type { DEVICE, DEVICE_MANAGED };
 
 template <memory_type MemoryT>
 class bulk_allocator;
-template <typename T> class dvec2;
+template <typename T>
+class dvec2;
 
 template <typename T>
 class dvec {
   friend class dvec2<T>;
 
  private:
   T *_ptr;
   size_t _size;
@@ -327,9 +334,10 @@ class dvec {
     _ptr = static_cast<T *>(ptr);
     _size = size;
     _device_idx = device_idx;
+    safe_cuda(cudaSetDevice(_device_idx));
   }
 
-  dvec() : _ptr(NULL), _size(0), _device_idx(0) {}
+  dvec() : _ptr(NULL), _size(0), _device_idx(-1) {}
   size_t size() const { return _size; }
   int device_idx() const { return _device_idx; }
   bool empty() const { return _ptr == NULL || _size == 0; }
@@ -378,6 +386,10 @@ class dvec {
     if (other.device_idx() == this->device_idx()) {
       thrust::copy(other.tbegin(), other.tend(), this->tbegin());
     } else {
+      std::cout << "deviceother: " << other.device_idx()
+                << " devicethis: " << this->device_idx() << std::endl;
+      std::cout << "size deviceother: " << other.size()
+                << " devicethis: " << this->device_idx() << std::endl;
       throw std::runtime_error("Cannot copy to/from different devices");
     }
 
@@ -401,26 +413,24 @@ class dvec {
 */
 template <typename T>
 class dvec2 {
 
  private:
   dvec<T> _d1, _d2;
   cub::DoubleBuffer<T> _buff;
   int _device_idx;
 
  public:
   void external_allocate(int device_idx, void *ptr1, void *ptr2, size_t size) {
     if (!empty()) {
       throw std::runtime_error("Tried to allocate dvec2 but already allocated");
     }
+    _device_idx = device_idx;
     _d1.external_allocate(_device_idx, ptr1, size);
     _d2.external_allocate(_device_idx, ptr2, size);
     _buff.d_buffers[0] = static_cast<T *>(ptr1);
     _buff.d_buffers[1] = static_cast<T *>(ptr2);
     _buff.selector = 0;
-    _device_idx = device_idx;
   }
-  dvec2() : _d1(), _d2(), _buff(), _device_idx(0) {}
+  dvec2() : _d1(), _d2(), _buff(), _device_idx(-1) {}
 
   size_t size() const { return _d1.size(); }
   int device_idx() const { return _device_idx; }
@@ -433,7 +443,7 @@ class dvec2 {
 
   T *current() { return _buff.Current(); }
 
-  dvec<T> &current_dvec() { return _buff.selector == 0? d1() : d2(); }
+  dvec<T> &current_dvec() { return _buff.selector == 0 ? d1() : d2(); }
 
   T *other() { return _buff.Alternate(); }
 };
@@ -459,7 +469,8 @@ class bulk_allocator {
 
   template <typename T, typename SizeT, typename... Args>
   size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size, Args... args) {
-    return get_size_bytes<T,SizeT>(first_vec, first_size) + get_size_bytes(args...);
+    return get_size_bytes<T, SizeT>(first_vec, first_size) +
+           get_size_bytes(args...);
   }
 
   template <typename T, typename SizeT>
@@ -496,20 +507,23 @@ class bulk_allocator {
 
   template <typename T, typename SizeT, typename... Args>
   size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size, Args... args) {
-    return get_size_bytes<T,SizeT>(first_vec, first_size) + get_size_bytes(args...);
+    return get_size_bytes<T, SizeT>(first_vec, first_size) +
+           get_size_bytes(args...);
   }
 
   template <typename T, typename SizeT>
-  void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec, SizeT first_size) {
-    first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
-        static_cast<void *>(ptr+align_round_up(first_size * sizeof(T))),
-        first_size);
+  void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec,
+                     SizeT first_size) {
+    first_vec->external_allocate(
+        device_idx, static_cast<void *>(ptr),
+        static_cast<void *>(ptr + align_round_up(first_size * sizeof(T))),
+        first_size);
   }
 
   template <typename T, typename SizeT, typename... Args>
-  void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec, SizeT first_size,
-                     Args... args) {
-    allocate_dvec<T,SizeT>(device_idx, ptr, first_vec, first_size);
+  void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec,
+                     SizeT first_size, Args... args) {
+    allocate_dvec<T, SizeT>(device_idx, ptr, first_vec, first_size);
     ptr += (align_round_up(first_size * sizeof(T)) * 2);
     allocate_dvec(device_idx, ptr, args...);
   }
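The `get_size_bytes`/`allocate_dvec` overloads above are a variadic-template recursion: each call peels one (vector, size) pair off the argument pack and recurses on the rest, which is what lets a single `ba.allocate(device_idx, &v1, n1, &v2, n2, ...)` call size and carve up one contiguous device buffer. A toy host-only sketch of the same pattern (hypothetical names, no CUDA):

```cpp
#include <cstdio>
#include <vector>

// Base case: one (vector, count) pair contributes `count` elements.
template <typename T>
size_t total_elems(std::vector<T>*, size_t count) {
  return count;
}

// Recursive case: peel off the first pair, recurse on the rest of the pack.
template <typename T, typename... Args>
size_t total_elems(std::vector<T>* v, size_t count, Args... rest) {
  return total_elems(v, count) + total_elems(rest...);
}

int main() {
  std::vector<int> a, b;
  std::vector<float> c;
  // One call sizes all three buffers, mirroring bulk_allocator::allocate.
  printf("%zu\n", total_elems(&a, 10, &b, 20, &c, 5));  // prints 35
  return 0;
}
```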
@@ -706,11 +720,11 @@ struct BernoulliRng {
 * @param name name used to track later
 * @param stream cuda stream where to measure time
 */
 #define TIMEIT(call, name)    \
   do {                        \
     dh::Timer t1234;          \
     call;                     \
     t1234.printElapsed(name); \
-  } while(0)
+  } while (0)
 
 } // namespace dh

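For context on the `while (0)` form above: wrapping the macro body in do { ... } while (0) makes `TIMEIT(call, name);` expand to a single statement, so it stays safe inside an unbraced if/else. A stand-alone sketch with a stand-in macro of the same shape (names are illustrative, not from device_helpers.cuh):

```cpp
#include <cstdio>

// Stand-in with the same shape as TIMEIT: the do { ... } while (0) wrapper
// turns the multi-statement body into exactly one statement.
#define TIME_STUB(call, name)  \
  do {                         \
    call;                      \
    printf("%s done\n", name); \
  } while (0)

static int step() { return 42; }

int main(int argc, char**) {
  // Safe inside an unbraced if/else precisely because of the while (0) form.
  if (argc > 1)
    TIME_STUB(step(), "step");
  else
    printf("skipped\n");
  return 0;
}
```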
@@ -17,8 +17,8 @@
 
 #include "../../../../src/tree/param.h"
 #include "../common.cuh"
-#include "node.cuh"
 #include "loss_functions.cuh"
+#include "node.cuh"
 
 namespace xgboost {
 namespace tree {
@@ -45,7 +45,7 @@ HOST_DEV_INLINE Split maxSplit(Split a, Split b) {
     out.index = b.index;
   } else if (a.score == b.score) {
     out.score = a.score;
-    out.index = (a.index < b.index)? a.index : b.index;
+    out.index = (a.index < b.index) ? a.index : b.index;
   } else {
     out.score = a.score;
     out.index = a.index;
@@ -54,7 +54,7 @@ HOST_DEV_INLINE Split maxSplit(Split a, Split b) {
 }
 
 DEV_INLINE void atomicArgMax(Split* address, Split val) {
-  unsigned long long* intAddress = (unsigned long long*) address;
+  unsigned long long* intAddress = (unsigned long long*)address;
   unsigned long long old = *intAddress;
   unsigned long long assumed;
   do {
@@ -65,23 +65,19 @@ DEV_INLINE void atomicArgMax(Split* address, Split val) {
 }
 
 template <typename node_id_t>
-DEV_INLINE void argMaxWithAtomics(int id, Split* nodeSplits,
-                                  const gpu_gpair* gradScans,
-                                  const gpu_gpair* gradSums, const float* vals,
-                                  const int* colIds,
-                                  const node_id_t* nodeAssigns,
-                                  const Node<node_id_t>* nodes, int nUniqKeys,
-                                  node_id_t nodeStart, int len,
-                                  const TrainParam &param) {
+DEV_INLINE void argMaxWithAtomics(
+    int id, Split* nodeSplits, const gpu_gpair* gradScans,
+    const gpu_gpair* gradSums, const float* vals, const int* colIds,
+    const node_id_t* nodeAssigns, const Node<node_id_t>* nodes, int nUniqKeys,
+    node_id_t nodeStart, int len, const TrainParam& param) {
   int nodeId = nodeAssigns[id];
   ///@todo: this is really a bad check! but will be fixed when we move
   /// to key-based reduction
-  if ((id == 0) || !((nodeId == nodeAssigns[id-1]) &&
-                     (colIds[id] == colIds[id-1]) &&
-                     (vals[id] == vals[id-1]))) {
+  if ((id == 0) ||
+      !((nodeId == nodeAssigns[id - 1]) && (colIds[id] == colIds[id - 1]) &&
+        (vals[id] == vals[id - 1]))) {
     if (nodeId != UNUSED_NODE) {
-      int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart,
-                              nUniqKeys);
+      int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart, nUniqKeys);
       gpu_gpair colSum = gradSums[sumId];
       int uid = nodeId - nodeStart;
       Node<node_id_t> n = nodes[nodeId];
@@ -90,23 +86,20 @@ DEV_INLINE void argMaxWithAtomics(int id, Split* nodeSplits,
       bool tmp;
       Split s;
       gpu_gpair missing = parentSum - colSum;
-      s.score = loss_chg_missing(gradScans[id], missing, parentSum,
-                                 parentGain, param, tmp);
+      s.score = loss_chg_missing(gradScans[id], missing, parentSum, parentGain,
+                                 param, tmp);
       s.index = id;
-      atomicArgMax(nodeSplits+uid, s);
+      atomicArgMax(nodeSplits + uid, s);
     } // end if nodeId != UNUSED_NODE
   } // end if id == 0 ...
 }
 
 template <typename node_id_t>
-__global__ void atomicArgMaxByKeyGmem(Split* nodeSplits,
-                                      const gpu_gpair* gradScans,
-                                      const gpu_gpair* gradSums,
-                                      const float* vals, const int* colIds,
-                                      const node_id_t* nodeAssigns,
-                                      const Node<node_id_t>* nodes, int nUniqKeys,
-                                      node_id_t nodeStart, int len,
-                                      const TrainParam param) {
+__global__ void atomicArgMaxByKeyGmem(
+    Split* nodeSplits, const gpu_gpair* gradScans, const gpu_gpair* gradSums,
+    const float* vals, const int* colIds, const node_id_t* nodeAssigns,
+    const Node<node_id_t>* nodes, int nUniqKeys, node_id_t nodeStart, int len,
+    const TrainParam param) {
   int id = threadIdx.x + (blockIdx.x * blockDim.x);
   const int stride = blockDim.x * gridDim.x;
   for (; id < len; id += stride) {
@@ -116,19 +109,16 @@ __global__ void atomicArgMaxByKeyGmem(Split* nodeSplits,
 }
 
 template <typename node_id_t>
-__global__ void atomicArgMaxByKeySmem(Split* nodeSplits,
-                                      const gpu_gpair* gradScans,
-                                      const gpu_gpair* gradSums,
-                                      const float* vals, const int* colIds,
-                                      const node_id_t* nodeAssigns,
-                                      const Node<node_id_t>* nodes, int nUniqKeys,
-                                      node_id_t nodeStart, int len,
-                                      const TrainParam param) {
+__global__ void atomicArgMaxByKeySmem(
+    Split* nodeSplits, const gpu_gpair* gradScans, const gpu_gpair* gradSums,
+    const float* vals, const int* colIds, const node_id_t* nodeAssigns,
+    const Node<node_id_t>* nodes, int nUniqKeys, node_id_t nodeStart, int len,
+    const TrainParam param) {
   extern __shared__ char sArr[];
   Split* sNodeSplits = (Split*)sArr;
   int tid = threadIdx.x;
   Split defVal;
 #pragma unroll 1
   for (int i = tid; i < nUniqKeys; i += blockDim.x) {
     sNodeSplits[i] = defVal;
   }
@@ -142,7 +132,7 @@ __global__ void atomicArgMaxByKeySmem(Split* nodeSplits,
   __syncthreads();
   for (int i = tid; i < nUniqKeys; i += blockDim.x) {
     Split s = sNodeSplits[i];
-    atomicArgMax(nodeSplits+i, s);
+    atomicArgMax(nodeSplits + i, s);
   }
 }
 
@@ -162,28 +152,30 @@ __global__ void atomicArgMaxByKeySmem(Split* nodeSplits,
 * @param param training parameters
 * @param algo which algorithm to use for argmax_by_key
 */
-template <typename node_id_t, int BLKDIM=256, int ITEMS_PER_THREAD=4>
+template <typename node_id_t, int BLKDIM = 256, int ITEMS_PER_THREAD = 4>
 void argMaxByKey(Split* nodeSplits, const gpu_gpair* gradScans,
-                 const gpu_gpair* gradSums, const float* vals, const int* colIds,
-                 const node_id_t* nodeAssigns, const Node<node_id_t>* nodes, int nUniqKeys,
+                 const gpu_gpair* gradSums, const float* vals,
+                 const int* colIds, const node_id_t* nodeAssigns,
+                 const Node<node_id_t>* nodes, int nUniqKeys,
                  node_id_t nodeStart, int len, const TrainParam param,
                  ArgMaxByKeyAlgo algo) {
-  fillConst<Split,BLKDIM,ITEMS_PER_THREAD>(param.gpu_id, nodeSplits, nUniqKeys, Split());
-  int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD*BLKDIM);
-  switch(algo) {
-  case ABK_GMEM:
-    atomicArgMaxByKeyGmem<node_id_t><<<nBlks,BLKDIM>>>
-      (nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
-       nUniqKeys, nodeStart, len, param);
-    break;
-  case ABK_SMEM:
-    atomicArgMaxByKeySmem<node_id_t>
-      <<<nBlks,BLKDIM,sizeof(Split)*nUniqKeys>>>
-      (nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
-       nUniqKeys, nodeStart, len, param);
-    break;
-  default:
-    throw std::runtime_error("argMaxByKey: Bad algo passed!");
+  fillConst<Split, BLKDIM, ITEMS_PER_THREAD>(dh::get_device_idx(param.gpu_id),
+                                             nodeSplits, nUniqKeys, Split());
+  int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD * BLKDIM);
+  switch (algo) {
+    case ABK_GMEM:
+      atomicArgMaxByKeyGmem<node_id_t><<<nBlks, BLKDIM>>>(
+          nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
+          nUniqKeys, nodeStart, len, param);
+      break;
+    case ABK_SMEM:
+      atomicArgMaxByKeySmem<
+          node_id_t><<<nBlks, BLKDIM, sizeof(Split) * nUniqKeys>>>(
+          nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
+          nUniqKeys, nodeStart, len, param);
+      break;
    default:
      throw std::runtime_error("argMaxByKey: Bad algo passed!");
  };
 }

@@ -18,7 +18,6 @@
 #include "../common.cuh"
 #include "gradients.cuh"
 
-
 namespace xgboost {
 namespace tree {
 namespace exact {
@@ -41,7 +40,7 @@ static const int NONE_KEY = -100;
 * @param tmpKeys keys buffer
 * @param size number of elements that will be scanned
 */
-template <int BLKDIM_L1L3=256>
+template <int BLKDIM_L1L3 = 256>
 int scanTempBufferSize(int size) {
   int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
   return nBlks;
@@ -49,7 +48,7 @@ int scanTempBufferSize(int size) {
 
 struct AddByKey {
   template <typename T>
-  HOST_DEV_INLINE T operator()(const T &first, const T &second) const {
+  HOST_DEV_INLINE T operator()(const T& first, const T& second) const {
     T result;
     if (first.key == second.key) {
       result.key = first.key;
@@ -74,7 +73,7 @@ __global__ void cubScanByKeyL1(gpu_gpair* scans, const gpu_gpair* vals,
   typedef cub::BlockScan<Pair, BLKDIM_L1L3> BlockScan;
   __shared__ typename BlockScan::TempStorage temp_storage;
   Pair threadData;
-  int tid = blockIdx.x*BLKDIM_L1L3 + threadIdx.x;
+  int tid = blockIdx.x * BLKDIM_L1L3 + threadIdx.x;
   if (tid < size) {
     myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
     myValue = get(tid, vals, instIds);
@@ -82,7 +81,7 @@ __global__ void cubScanByKeyL1(gpu_gpair* scans, const gpu_gpair* vals,
     myKey = NONE_KEY;
     myValue = 0.f;
   }
   threadData.key = myKey;
   threadData.value = myValue;
   // get previous key, especially needed for the last thread in this block
   // in order to pass on the partial scan values.
@@ -90,18 +89,17 @@ __global__ void cubScanByKeyL1(gpu_gpair* scans, const gpu_gpair* vals,
   // else, the result of this shuffle operation will be undefined
   int previousKey = __shfl_up(myKey, 1);
   // Collectively compute the block-wide exclusive prefix sum
-  BlockScan(temp_storage).ExclusiveScan(threadData, threadData, rootPair,
-                                        AddByKey());
+  BlockScan(temp_storage)
+      .ExclusiveScan(threadData, threadData, rootPair, AddByKey());
   if (tid < size) {
     scans[tid] = threadData.value;
   } else {
     return;
   }
   if (threadIdx.x == BLKDIM_L1L3 - 1) {
-    threadData.value = (myKey == previousKey)?
-                         threadData.value :
-                         gpu_gpair(0.0f, 0.0f);
+    threadData.value =
+        (myKey == previousKey) ? threadData.value : gpu_gpair(0.0f, 0.0f);
     mKeys[blockIdx.x] = myKey;
     mScans[blockIdx.x] = threadData.value + myValue;
   }
 }
@@ -111,11 +109,10 @@ __global__ void cubScanByKeyL2(gpu_gpair* mScans, int* mKeys, int mLength) {
   typedef cub::BlockScan<Pair, BLKSIZE, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
   Pair threadData;
   __shared__ typename BlockScan::TempStorage temp_storage;
-  for (int i = threadIdx.x; i < mLength; i += BLKSIZE-1) {
+  for (int i = threadIdx.x; i < mLength; i += BLKSIZE - 1) {
     threadData.key = mKeys[i];
     threadData.value = mScans[i];
-    BlockScan(temp_storage).InclusiveScan(threadData, threadData,
-                                          AddByKey());
+    BlockScan(temp_storage).InclusiveScan(threadData, threadData, AddByKey());
     mScans[i] = threadData.value;
     __syncthreads();
   }
@@ -136,15 +133,14 @@ __global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans,
   __shared__ char gradBuff[sizeof(gpu_gpair)];
   __shared__ int s_mKeys;
   gpu_gpair* s_mScans = (gpu_gpair*)gradBuff;
-  if(tid >= size)
-    return;
+  if (tid >= size) return;
   // cache block-wide partial scan info
   if (relId == 0) {
-    s_mKeys = (blockIdx.x > 0)? mKeys[blockIdx.x-1] : NONE_KEY;
-    s_mScans[0] = (blockIdx.x > 0)? mScans[blockIdx.x-1] : gpu_gpair();
+    s_mKeys = (blockIdx.x > 0) ? mKeys[blockIdx.x - 1] : NONE_KEY;
+    s_mScans[0] = (blockIdx.x > 0) ? mScans[blockIdx.x - 1] : gpu_gpair();
   }
   int myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
-  int previousKey = tid == 0 ? NONE_KEY : abs2uniqKey(tid-1, keys, colIds,
+  int previousKey = tid == 0 ? NONE_KEY : abs2uniqKey(tid - 1, keys, colIds,
                                                       nodeStart, nUniqKeys);
   gpu_gpair myValue = scans[tid];
   __syncthreads();
@@ -162,9 +158,11 @@ __global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans,
 }
 
 /**
- * @brief Performs fused reduce and scan by key functionality. It is assumed that
+ * @brief Performs fused reduce and scan by key functionality. It is assumed
+ * that
 * the keys occur contiguously!
- * @param sums the output gradient reductions for each element performed key-wise
+ * @param sums the output gradient reductions for each element performed
+ * key-wise
 * @param scans the output gradient scans for each element performed key-wise
 * @param vals the gradients evaluated for each observation.
 * @param instIds instance ids for each element
@@ -179,20 +177,20 @@ __global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans,
 * @param colIds column indices for each element in the array
 * @param nodeStart index of the leftmost node in the current level
 */
-template <typename node_id_t, int BLKDIM_L1L3=256, int BLKDIM_L2=512>
+template <typename node_id_t, int BLKDIM_L1L3 = 256, int BLKDIM_L2 = 512>
 void reduceScanByKey(gpu_gpair* sums, gpu_gpair* scans, const gpu_gpair* vals,
                      const int* instIds, const node_id_t* keys, int size,
                      int nUniqKeys, int nCols, gpu_gpair* tmpScans,
                      int* tmpKeys, const int* colIds, node_id_t nodeStart) {
   int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
-  cudaMemset(sums, 0, nUniqKeys*nCols*sizeof(gpu_gpair));
-  cubScanByKeyL1<node_id_t,BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>
-    (scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
-     nodeStart, size);
+  cudaMemset(sums, 0, nUniqKeys * nCols * sizeof(gpu_gpair));
+  cubScanByKeyL1<node_id_t, BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>(
+      scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
+      nodeStart, size);
   cubScanByKeyL2<BLKDIM_L2><<<1, BLKDIM_L2>>>(tmpScans, tmpKeys, nBlks);
-  cubScanByKeyL3<node_id_t,BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>
-    (sums, scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
-     nodeStart, size);
+  cubScanByKeyL3<node_id_t, BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>(
+      sums, scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
+      nodeStart, size);
 }
 
 } // namespace exact

@@ -1,5 +1,6 @@
 /*
- * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
+ * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights
+ * reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,18 +16,17 @@
 */
 #pragma once
 
-#include "../../../../src/tree/param.h"
-#include "xgboost/tree_updater.h"
-#include "cub/cub.cuh"
-#include "../common.cuh"
 #include <vector>
-#include "loss_functions.cuh"
-#include "gradients.cuh"
-#include "node.cuh"
+#include "../../../../src/tree/param.h"
+#include "../common.cuh"
 #include "argmax_by_key.cuh"
-#include "split2node.cuh"
+#include "cub/cub.cuh"
 #include "fused_scan_reduce_by_key.cuh"
+#include "gradients.cuh"
+#include "loss_functions.cuh"
+#include "node.cuh"
+#include "split2node.cuh"
+#include "xgboost/tree_updater.h"
 
 namespace xgboost {
 namespace tree {
@@ -48,8 +48,8 @@ template <typename node_id_t>
 __global__ void assignColIds(int* colIds, const int* colOffsets) {
   int myId = blockIdx.x;
   int start = colOffsets[myId];
-  int end = colOffsets[myId+1];
-  for (int id = start+threadIdx.x; id < end; id += blockDim.x) {
+  int end = colOffsets[myId + 1];
+  for (int id = start + threadIdx.x; id < end; id += blockDim.x) {
     colIds[id] = myId;
   }
 }
@@ -70,7 +70,7 @@ __global__ void fillDefaultNodeIds(node_id_t* nodeIdsPerInst,
   node_id_t result;
   if (n.isLeaf() || n.isUnused()) {
     result = UNUSED_NODE;
-  } else if(n.isDefaultLeft()) {
+  } else if (n.isDefaultLeft()) {
     result = (2 * n.id) + 1;
   } else {
     result = (2 * n.id) + 2;
@@ -81,8 +81,9 @@ __global__ void fillDefaultNodeIds(node_id_t* nodeIdsPerInst,
 template <typename node_id_t>
 __global__ void assignNodeIds(node_id_t* nodeIdsPerInst, int* nodeLocations,
                               const node_id_t* nodeIds, const int* instId,
-                              const Node<node_id_t>* nodes, const int* colOffsets,
-                              const float* vals, int nVals, int nCols) {
+                              const Node<node_id_t>* nodes,
+                              const int* colOffsets, const float* vals,
+                              int nVals, int nCols) {
   int id = threadIdx.x + (blockIdx.x * blockDim.x);
   const int stride = blockDim.x * gridDim.x;
   for (; id < nVals; id += stride) {
@@ -95,7 +96,7 @@ __global__ void assignNodeIds(node_id_t* nodeIdsPerInst, int* nodeLocations,
     if (nId != UNUSED_NODE) {
       const Node<node_id_t> n = nodes[nId];
       int colId = n.colIdx;
-      //printf("nid=%d colId=%d id=%d\n", nId, colId, id);
+      // printf("nid=%d colId=%d id=%d\n", nId, colId, id);
       int start = colOffsets[colId];
       int end = colOffsets[colId + 1];
       ///@todo: too much wasteful threads!!
@@ -114,20 +115,26 @@ __global__ void markLeavesKernel(Node<node_id_t>* nodes, int len) {
     int lid = (id << 1) + 1;
     int rid = (id << 1) + 2;
     if ((lid >= len) || (rid >= len)) {
-      nodes[id].score = -FLT_MAX; // bottom-most nodes
+      nodes[id].score = -FLT_MAX;  // bottom-most nodes
     } else if (nodes[lid].isUnused() && nodes[rid].isUnused()) {
-      nodes[id].score = -FLT_MAX; // unused child nodes
+      nodes[id].score = -FLT_MAX;  // unused child nodes
     }
   }
 }
 
 // unit test forward declaration for friend function access
-template <typename node_id_t> void testSmallData();
-template <typename node_id_t> void testLargeData();
-template <typename node_id_t> void testAllocate();
-template <typename node_id_t> void testMarkLeaves();
-template <typename node_id_t> void testDense2Sparse();
-template <typename node_id_t> class GPUBuilder;
+template <typename node_id_t>
+void testSmallData();
+template <typename node_id_t>
+void testLargeData();
+template <typename node_id_t>
+void testAllocate();
+template <typename node_id_t>
+void testMarkLeaves();
+template <typename node_id_t>
+void testDense2Sparse();
+template <typename node_id_t>
+class GPUBuilder;
 template <typename node_id_t>
 std::shared_ptr<xgboost::DMatrix> setupGPUBuilder(
     const std::string& file,
@@ -136,7 +143,7 @@ std::shared_ptr<xgboost::DMatrix> setupGPUBuilder(
 template <typename node_id_t>
 class GPUBuilder {
  public:
-  GPUBuilder(): allocated(false) {}
+  GPUBuilder() : allocated(false) {}
 
   ~GPUBuilder() {}
 
@@ -146,10 +153,10 @@ class GPUBuilder {
     maxLeaves = 1 << param.max_depth;
   }
 
-  void UpdateParam(const TrainParam &param) { this->param = param; }
+  void UpdateParam(const TrainParam& param) { this->param = param; }
 
   /// @note: Update should be only after Init!!
-  void Update(const std::vector<bst_gpair>& gpair, DMatrix *hMat,
+  void Update(const std::vector<bst_gpair>& gpair, DMatrix* hMat,
               RegTree* hTree) {
     if (!allocated) {
       setupOneTimeData(*hMat);
@@ -171,7 +178,7 @@ class GPUBuilder {
     dense2sparse(*hTree);
   }
 
  private:
   friend void testSmallData<node_id_t>();
   friend void testLargeData<node_id_t>();
   friend void testAllocate<node_id_t>();
@@ -194,7 +201,7 @@ private:
   dh::dvec<gpu_gpair> gradsInst;
   dh::dvec2<node_id_t> nodeAssigns;
   dh::dvec2<int> nodeLocations;
-  dh::dvec<Node<node_id_t> > nodes;
+  dh::dvec<Node<node_id_t>> nodes;
   dh::dvec<node_id_t> nodeAssignsPerInst;
   dh::dvec<gpu_gpair> gradSums;
   dh::dvec<gpu_gpair> gradScans;
@@ -218,35 +225,26 @@ private:
     argMaxByKey(nodeSplits.data(), gradScans.data(), gradSums.data(),
                 vals.current(), colIds.data(), nodeAssigns.current(),
                 nodes.data(), nNodes, nodeStart, nVals, param,
-                level<=MAX_ABK_LEVELS? ABK_SMEM : ABK_GMEM);
+                level <= MAX_ABK_LEVELS ? ABK_SMEM : ABK_GMEM);
     split2node(nodes.data(), nodeSplits.data(), gradScans.data(),
-               gradSums.data(), vals.current(), colIds.data(), colOffsets.data(),
-               nodeAssigns.current(), nNodes, nodeStart, nCols, param);
+               gradSums.data(), vals.current(), colIds.data(),
+               colOffsets.data(), nodeAssigns.current(), nNodes, nodeStart,
+               nCols, param);
   }
 
   void allocateAllData(int offsetSize) {
     int tmpBuffSize = scanTempBufferSize(nVals);
-    ba.allocate(param.gpu_id,
-                &vals, nVals,
-                &vals_cached, nVals,
-                &instIds, nVals,
-                &instIds_cached, nVals,
-                &colOffsets, offsetSize,
-                &gradsInst, nRows,
-                &nodeAssigns, nVals,
-                &nodeLocations, nVals,
-                &nodes, maxNodes,
-                &nodeAssignsPerInst, nRows,
-                &gradSums, maxLeaves*nCols,
-                &gradScans, nVals,
-                &nodeSplits, maxLeaves,
-                &tmpScanGradBuff, tmpBuffSize,
-                &tmpScanKeyBuff, tmpBuffSize,
-                &colIds, nVals);
+    ba.allocate(dh::get_device_idx(param.gpu_id), &vals, nVals, &vals_cached,
+                nVals, &instIds, nVals, &instIds_cached, nVals, &colOffsets,
+                offsetSize, &gradsInst, nRows, &nodeAssigns, nVals,
+                &nodeLocations, nVals, &nodes, maxNodes, &nodeAssignsPerInst,
+                nRows, &gradSums, maxLeaves * nCols, &gradScans, nVals,
+                &nodeSplits, maxLeaves, &tmpScanGradBuff, tmpBuffSize,
+                &tmpScanKeyBuff, tmpBuffSize, &colIds, nVals);
  }
 
  void setupOneTimeData(DMatrix& hMat) {
-    size_t free_memory = dh::available_memory(param.gpu_id);
+    size_t free_memory = dh::available_memory(dh::get_device_idx(param.gpu_id));
    if (!hMat.SingleColBlock()) {
      throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
|
||||||
}
|
}
|
||||||
@ -259,7 +257,8 @@ private:
|
|||||||
if (!param.silent) {
|
if (!param.silent) {
|
||||||
const int mb_size = 1048576;
|
const int mb_size = 1048576;
|
||||||
LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
|
LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
|
||||||
<< free_memory / mb_size << " MB on " << dh::device_name(param.gpu_id);
|
<< free_memory / mb_size << " MB on "
|
||||||
|
<< dh::device_name(dh::get_device_idx(param.gpu_id));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
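The recurring change in this hunk is that the raw param.gpu_id is no longer passed straight to the allocator or to available_memory()/device_name(); it is first mapped through dh::get_device_idx(). The helper's body is not shown in this diff, but together with the dList fix further down (which wraps by dh::n_visible_devices()) the intent appears to be normalising a user-supplied gpu_id onto an existing CUDA device. A hedged sketch of that idea, purely illustrative and not the actual device_helpers implementation:

    #include <cuda_runtime.h>
    #include <stdexcept>

    // Illustrative stand-in for dh::get_device_idx(): wrap an arbitrary
    // user-supplied gpu_id onto a visible CUDA device index instead of
    // letting an out-of-range id reach cudaSetDevice/cudaMemGetInfo.
    inline int get_device_idx_sketch(int gpu_id) {
      int n_visible = 0;
      if (cudaGetDeviceCount(&n_visible) != cudaSuccess || n_visible == 0) {
        throw std::runtime_error("no visible CUDA devices");
      }
      // The double modulo also handles negative ids.
      return ((gpu_id % n_visible) + n_visible) % n_visible;
    }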
@ -282,9 +281,10 @@ private:
|
|||||||
iter->BeforeFirst();
|
iter->BeforeFirst();
|
||||||
while (iter->Next()) {
|
while (iter->Next()) {
|
||||||
const ColBatch& batch = iter->Value();
|
const ColBatch& batch = iter->Value();
|
||||||
for (int i=0;i<batch.size;i++) {
|
for (int i = 0; i < batch.size; i++) {
|
||||||
const ColBatch::Inst& col = batch[i];
|
const ColBatch::Inst& col = batch[i];
|
||||||
for (const ColBatch::Entry* it=col.data;it!=col.data+col.length;it++) {
|
for (const ColBatch::Entry* it = col.data; it != col.data + col.length;
|
||||||
|
it++) {
|
||||||
int inst_id = static_cast<int>(it->index);
|
int inst_id = static_cast<int>(it->index);
|
||||||
fval.push_back(it->fvalue);
|
fval.push_back(it->fvalue);
|
||||||
fId.push_back(inst_id);
|
fId.push_back(inst_id);
|
||||||
@ -301,16 +301,17 @@ private:
|
|||||||
vals.current_dvec() = fval;
|
vals.current_dvec() = fval;
|
||||||
instIds.current_dvec() = fId;
|
instIds.current_dvec() = fId;
|
||||||
colOffsets = offset;
|
colOffsets = offset;
|
||||||
segmentedSort<float,int>(tmp_mem, vals, instIds, nVals, nCols, colOffsets);
|
segmentedSort<float, int>(tmp_mem, vals, instIds, nVals, nCols, colOffsets);
|
||||||
vals_cached = vals.current_dvec();
|
vals_cached = vals.current_dvec();
|
||||||
instIds_cached = instIds.current_dvec();
|
instIds_cached = instIds.current_dvec();
|
||||||
assignColIds<node_id_t><<<nCols,512>>>(colIds.data(), colOffsets.data());
|
assignColIds<node_id_t><<<nCols, 512>>>(colIds.data(), colOffsets.data());
|
||||||
}
|
}
|
||||||
|
|
||||||
void transferGrads(const std::vector<bst_gpair>& gpair) {
|
void transferGrads(const std::vector<bst_gpair>& gpair) {
|
||||||
// HACK
|
// HACK
|
||||||
dh::safe_cuda(cudaMemcpy(gradsInst.data(), &(gpair[0]),
|
dh::safe_cuda(cudaMemcpy(gradsInst.data(), &(gpair[0]),
|
||||||
sizeof(gpu_gpair)*nRows, cudaMemcpyHostToDevice));
|
sizeof(gpu_gpair) * nRows,
|
||||||
|
cudaMemcpyHostToDevice));
|
||||||
// evaluate the full-grad reduction for the root node
|
// evaluate the full-grad reduction for the root node
|
||||||
sumReduction<gpu_gpair>(tmp_mem, gradsInst, gradSums, nRows);
|
sumReduction<gpu_gpair>(tmp_mem, gradsInst, gradSums, nRows);
|
||||||
}
|
}
|
||||||
@ -324,25 +325,23 @@ private:
|
|||||||
// for root node, just update the gradient/score/weight/id info
|
// for root node, just update the gradient/score/weight/id info
|
||||||
// before splitting it! Currently all data is on GPU, hence this
|
// before splitting it! Currently all data is on GPU, hence this
|
||||||
// stupid little kernel
|
// stupid little kernel
|
||||||
initRootNode<<<1,1>>>(nodes.data(), gradSums.data(), param);
|
initRootNode<<<1, 1>>>(nodes.data(), gradSums.data(), param);
|
||||||
} else {
|
} else {
|
||||||
const int BlkDim = 256;
|
const int BlkDim = 256;
|
||||||
const int ItemsPerThread = 4;
|
const int ItemsPerThread = 4;
|
||||||
// assign default node ids first
|
// assign default node ids first
|
||||||
int nBlks = dh::div_round_up(nRows, BlkDim);
|
int nBlks = dh::div_round_up(nRows, BlkDim);
|
||||||
fillDefaultNodeIds<<<nBlks,BlkDim>>>(nodeAssignsPerInst.data(),
|
fillDefaultNodeIds<<<nBlks, BlkDim>>>(nodeAssignsPerInst.data(),
|
||||||
nodes.data(), nRows);
|
nodes.data(), nRows);
|
||||||
// evaluate the correct child indices of non-missing values next
|
// evaluate the correct child indices of non-missing values next
|
||||||
nBlks = dh::div_round_up(nVals, BlkDim*ItemsPerThread);
|
nBlks = dh::div_round_up(nVals, BlkDim * ItemsPerThread);
|
||||||
assignNodeIds<<<nBlks,BlkDim>>>(nodeAssignsPerInst.data(),
|
assignNodeIds<<<nBlks, BlkDim>>>(
|
||||||
nodeLocations.current(),
|
nodeAssignsPerInst.data(), nodeLocations.current(),
|
||||||
nodeAssigns.current(),
|
nodeAssigns.current(), instIds.current(), nodes.data(),
|
||||||
instIds.current(), nodes.data(),
|
colOffsets.data(), vals.current(), nVals, nCols);
|
||||||
colOffsets.data(), vals.current(),
|
|
||||||
nVals, nCols);
|
|
||||||
// gather the node assignments across all other columns too
|
// gather the node assignments across all other columns too
|
||||||
gather<node_id_t>(param.gpu_id, nodeAssigns.current(), nodeAssignsPerInst.data(),
|
gather<node_id_t>(dh::get_device_idx(param.gpu_id), nodeAssigns.current(),
|
||||||
instIds.current(), nVals);
|
nodeAssignsPerInst.data(), instIds.current(), nVals);
|
||||||
sortKeys(level);
|
sortKeys(level);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -351,9 +350,10 @@ private:
|
|||||||
// segmented-sort the arrays based on node-id's
|
// segmented-sort the arrays based on node-id's
|
||||||
// but we don't need more than level+1 bits for sorting!
|
// but we don't need more than level+1 bits for sorting!
|
||||||
segmentedSort(tmp_mem, nodeAssigns, nodeLocations, nVals, nCols, colOffsets,
|
segmentedSort(tmp_mem, nodeAssigns, nodeLocations, nVals, nCols, colOffsets,
|
||||||
0, level+1);
|
0, level + 1);
|
||||||
gather<float,int>(param.gpu_id, vals.other(), vals.current(), instIds.other(),
|
gather<float, int>(dh::get_device_idx(param.gpu_id), vals.other(),
|
||||||
instIds.current(), nodeLocations.current(), nVals);
|
vals.current(), instIds.other(), instIds.current(),
|
||||||
|
nodeLocations.current(), nVals);
|
||||||
vals.buff().selector ^= 1;
|
vals.buff().selector ^= 1;
|
||||||
instIds.buff().selector ^= 1;
|
instIds.buff().selector ^= 1;
|
||||||
}
|
}
|
||||||
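The sortKeys() hunk above relies on a small radix-width optimisation: with the dense BFS layout (children of node i at 2*i+1 and 2*i+2), the node ids present at level L never exceed 2^(L+1) - 2, so the segmented sort only has to look at the low level+1 bits of each key, which is what the (0, level + 1) bit range in the segmentedSort call expresses; the selector ^= 1 lines then swap the ping-pong buffers instead of copying. A tiny host-side check of the bit-width claim (helper name illustrative):

    #include <cassert>

    // Last node id on BFS level `level` is 2^(level+1) - 2, which always
    // fits in (level + 1) bits, so a radix sort over that bit range suffices.
    inline int bits_needed_for_level(int level) {
      int max_id = (1 << (level + 1)) - 2;
      int bits = 0;
      while (max_id >> bits) ++bits;
      assert(bits <= level + 1);
      return bits;
    }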
@ -361,11 +361,11 @@ private:
|
|||||||
void markLeaves() {
|
void markLeaves() {
|
||||||
const int BlkDim = 128;
|
const int BlkDim = 128;
|
||||||
int nBlks = dh::div_round_up(maxNodes, BlkDim);
|
int nBlks = dh::div_round_up(maxNodes, BlkDim);
|
||||||
markLeavesKernel<<<nBlks,BlkDim>>>(nodes.data(), maxNodes);
|
markLeavesKernel<<<nBlks, BlkDim>>>(nodes.data(), maxNodes);
|
||||||
}
|
}
|
||||||
|
|
||||||
void dense2sparse(RegTree &tree) {
|
void dense2sparse(RegTree& tree) {
|
||||||
std::vector<Node<node_id_t> > hNodes = nodes.as_vector();
|
std::vector<Node<node_id_t>> hNodes = nodes.as_vector();
|
||||||
int nodeId = 0;
|
int nodeId = 0;
|
||||||
for (int i = 0; i < maxNodes; ++i) {
|
for (int i = 0; i < maxNodes; ++i) {
|
||||||
const Node<node_id_t>& n = hNodes[i];
|
const Node<node_id_t>& n = hNodes[i];
|
||||||
@ -375,7 +375,7 @@ private:
|
|||||||
++nodeId;
|
++nodeId;
|
||||||
} else if (!hNodes[i].isUnused()) {
|
} else if (!hNodes[i].isUnused()) {
|
||||||
tree.AddChilds(nodeId);
|
tree.AddChilds(nodeId);
|
||||||
tree[nodeId].set_split(n.colIdx, n.threshold, n.dir==LeftDir);
|
tree[nodeId].set_split(n.colIdx, n.threshold, n.dir == LeftDir);
|
||||||
tree.stat(nodeId).loss_chg = n.score;
|
tree.stat(nodeId).loss_chg = n.score;
|
||||||
tree.stat(nodeId).sum_hess = n.gradSum.h;
|
tree.stat(nodeId).sum_hess = n.gradSum.h;
|
||||||
tree.stat(nodeId).base_weight = n.weight;
|
tree.stat(nodeId).base_weight = n.weight;
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
|
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights
|
||||||
|
* reserved.
|
||||||
*
|
*
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
* you may not use this file except in compliance with the License.
|
* you may not use this file except in compliance with the License.
|
||||||
@ -17,7 +18,6 @@
|
|||||||
|
|
||||||
#include "../common.cuh"
|
#include "../common.cuh"
|
||||||
|
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace tree {
|
namespace tree {
|
||||||
namespace exact {
|
namespace exact {
|
||||||
@ -32,9 +32,9 @@ struct gpu_gpair {
|
|||||||
/** the 'h_i' as it appears in the xgboost paper */
|
/** the 'h_i' as it appears in the xgboost paper */
|
||||||
float h;
|
float h;
|
||||||
|
|
||||||
HOST_DEV_INLINE gpu_gpair(): g(0.f), h(0.f) {}
|
HOST_DEV_INLINE gpu_gpair() : g(0.f), h(0.f) {}
|
||||||
HOST_DEV_INLINE gpu_gpair(const float& _g, const float& _h): g(_g), h(_h) {}
|
HOST_DEV_INLINE gpu_gpair(const float& _g, const float& _h) : g(_g), h(_h) {}
|
||||||
HOST_DEV_INLINE gpu_gpair(const gpu_gpair& a): g(a.g), h(a.h) {}
|
HOST_DEV_INLINE gpu_gpair(const gpu_gpair& a) : g(a.g), h(a.h) {}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Checks whether the hessian is more than the defined weight
|
* @brief Checks whether the hessian is more than the defined weight
|
||||||
@ -60,12 +60,12 @@ struct gpu_gpair {
|
|||||||
|
|
||||||
HOST_DEV_INLINE friend gpu_gpair operator+(const gpu_gpair& a,
|
HOST_DEV_INLINE friend gpu_gpair operator+(const gpu_gpair& a,
|
||||||
const gpu_gpair& b) {
|
const gpu_gpair& b) {
|
||||||
return gpu_gpair(a.g+b.g, a.h+b.h);
|
return gpu_gpair(a.g + b.g, a.h + b.h);
|
||||||
}
|
}
|
||||||
|
|
||||||
HOST_DEV_INLINE friend gpu_gpair operator-(const gpu_gpair& a,
|
HOST_DEV_INLINE friend gpu_gpair operator-(const gpu_gpair& a,
|
||||||
const gpu_gpair& b) {
|
const gpu_gpair& b) {
|
||||||
return gpu_gpair(a.g-b.g, a.h-b.h);
|
return gpu_gpair(a.g - b.g, a.h - b.h);
|
||||||
}
|
}
|
||||||
|
|
||||||
HOST_DEV_INLINE gpu_gpair(int value) {
|
HOST_DEV_INLINE gpu_gpair(int value) {
|
||||||
@ -73,7 +73,6 @@ struct gpu_gpair {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Gradient value getter function
|
* @brief Gradient value getter function
|
||||||
* @param id the index into the vals or instIds array to which to fetch
|
* @param id the index into the vals or instIds array to which to fetch
|
||||||
@ -81,7 +80,8 @@ struct gpu_gpair {
|
|||||||
* @param instIds instance index buffer
|
* @param instIds instance index buffer
|
||||||
* @return the expected gradient value
|
* @return the expected gradient value
|
||||||
*/
|
*/
|
||||||
HOST_DEV_INLINE gpu_gpair get(int id, const gpu_gpair* vals, const int* instIds) {
|
HOST_DEV_INLINE gpu_gpair get(int id, const gpu_gpair* vals,
|
||||||
|
const int* instIds) {
|
||||||
id = instIds[id];
|
id = instIds[id];
|
||||||
return vals[id];
|
return vals[id];
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
|
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights
|
||||||
|
* reserved.
|
||||||
*
|
*
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
* you may not use this file except in compliance with the License.
|
* you may not use this file except in compliance with the License.
|
||||||
@ -18,17 +19,13 @@
|
|||||||
#include "../common.cuh"
|
#include "../common.cuh"
|
||||||
#include "gradients.cuh"
|
#include "gradients.cuh"
|
||||||
|
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace tree {
|
namespace tree {
|
||||||
namespace exact {
|
namespace exact {
|
||||||
|
|
||||||
HOST_DEV_INLINE float device_calc_loss_chg(const TrainParam ¶m,
|
HOST_DEV_INLINE float device_calc_loss_chg(
|
||||||
const gpu_gpair &scan,
|
const TrainParam ¶m, const gpu_gpair &scan, const gpu_gpair &missing,
|
||||||
const gpu_gpair &missing,
|
const gpu_gpair &parent_sum, const float &parent_gain, bool missing_left) {
|
||||||
const gpu_gpair &parent_sum,
|
|
||||||
const float &parent_gain,
|
|
||||||
bool missing_left) {
|
|
||||||
gpu_gpair left = scan;
|
gpu_gpair left = scan;
|
||||||
if (missing_left) {
|
if (missing_left) {
|
||||||
left += missing;
|
left += missing;
|
||||||
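For context on device_calc_loss_chg above: only the start of the function appears in this hunk, but its arguments are the quantities of the usual exact-greedy gain. The missing-value mass (parent gradient sum minus the column's scanned sum) is routed to whichever side is being tried, the other side is the remainder, and the loss change is the children's gain minus the parent's. A simplified sketch under the assumption that the per-node gain is G^2 / (H + lambda); the real CalcGain in src/tree/param.h also covers alpha and max_delta_step:

    struct PairGH { float g; float h; };

    inline float gain_sketch(const PairGH& s, float lambda) {
      return (s.g * s.g) / (s.h + lambda);
    }

    // Loss change for a split with the missing values sent left or right.
    inline float loss_chg_sketch(const PairGH& scan, const PairGH& missing,
                                 const PairGH& parent_sum, float parent_gain,
                                 bool missing_left, float lambda) {
      PairGH left = scan;
      if (missing_left) {  // route the missing-value mass to the left child
        left.g += missing.g;
        left.h += missing.h;
      }
      const PairGH right{parent_sum.g - left.g, parent_sum.h - left.h};
      return gain_sketch(left, lambda) + gain_sketch(right, lambda) - parent_gain;
    }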
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
|
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights
|
||||||
|
* reserved.
|
||||||
*
|
*
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
* you may not use this file except in compliance with the License.
|
* you may not use this file except in compliance with the License.
|
||||||
@ -15,9 +16,8 @@
|
|||||||
*/
|
*/
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "gradients.cuh"
|
|
||||||
#include "../common.cuh"
|
#include "../common.cuh"
|
||||||
|
#include "gradients.cuh"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace tree {
|
namespace tree {
|
||||||
@ -34,11 +34,9 @@ enum DefaultDirection {
|
|||||||
RightDir
|
RightDir
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/** used to assign default id to a Node */
|
/** used to assign default id to a Node */
|
||||||
static const int UNUSED_NODE = -1;
|
static const int UNUSED_NODE = -1;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @struct Split node.cuh
|
* @struct Split node.cuh
|
||||||
* @brief Abstraction of a possible split in the decision tree
|
* @brief Abstraction of a possible split in the decision tree
|
||||||
@ -49,7 +47,7 @@ struct Split {
|
|||||||
/** index where to split in the DMatrix */
|
/** index where to split in the DMatrix */
|
||||||
int index;
|
int index;
|
||||||
|
|
||||||
HOST_DEV_INLINE Split(): score(-FLT_MAX), index(INT_MAX) {}
|
HOST_DEV_INLINE Split() : score(-FLT_MAX), index(INT_MAX) {}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Whether the split info is valid to be used to create a new child
|
* @brief Whether the split info is valid to be used to create a new child
|
||||||
@ -61,7 +59,6 @@ struct Split {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @struct Node node.cuh
|
* @struct Node node.cuh
|
||||||
* @brief Abstraction of a node in the decision tree
|
* @brief Abstraction of a node in the decision tree
|
||||||
@ -84,9 +81,14 @@ class Node {
|
|||||||
/** node id (used as key for reduce/scan) */
|
/** node id (used as key for reduce/scan) */
|
||||||
node_id_t id;
|
node_id_t id;
|
||||||
|
|
||||||
HOST_DEV_INLINE Node(): gradSum(), score(-FLT_MAX), weight(-FLT_MAX),
|
HOST_DEV_INLINE Node()
|
||||||
dir(LeftDir), threshold(0.f), colIdx(UNUSED_NODE),
|
: gradSum(),
|
||||||
id(UNUSED_NODE) {}
|
score(-FLT_MAX),
|
||||||
|
weight(-FLT_MAX),
|
||||||
|
dir(LeftDir),
|
||||||
|
threshold(0.f),
|
||||||
|
colIdx(UNUSED_NODE),
|
||||||
|
id(UNUSED_NODE) {}
|
||||||
|
|
||||||
/** Tells whether this node is part of the decision tree */
|
/** Tells whether this node is part of the decision tree */
|
||||||
HOST_DEV_INLINE bool isUnused() const { return (id == UNUSED_NODE); }
|
HOST_DEV_INLINE bool isUnused() const { return (id == UNUSED_NODE); }
|
||||||
@ -100,7 +102,6 @@ class Node {
|
|||||||
HOST_DEV_INLINE bool isDefaultLeft() const { return (dir == LeftDir); }
|
HOST_DEV_INLINE bool isDefaultLeft() const { return (dir == LeftDir); }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @struct Segment node.cuh
|
* @struct Segment node.cuh
|
||||||
* @brief Space inefficient, but super easy to implement structure to define
|
* @brief Space inefficient, but super easy to implement structure to define
|
||||||
@ -112,7 +113,7 @@ struct Segment {
|
|||||||
/** end index of the segment */
|
/** end index of the segment */
|
||||||
int end;
|
int end;
|
||||||
|
|
||||||
HOST_DEV_INLINE Segment(): start(-1), end(-1) {}
|
HOST_DEV_INLINE Segment() : start(-1), end(-1) {}
|
||||||
|
|
||||||
/** Checks whether the current structure defines a valid segment */
|
/** Checks whether the current structure defines a valid segment */
|
||||||
HOST_DEV_INLINE bool isValid() const {
|
HOST_DEV_INLINE bool isValid() const {
|
||||||
@ -120,7 +121,6 @@ struct Segment {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @enum NodeType node.cuh
|
* @enum NodeType node.cuh
|
||||||
* @brief Useful to describe the node type in a dense BFS-order tree array
|
* @brief Useful to describe the node type in a dense BFS-order tree array
|
||||||
@ -134,7 +134,6 @@ enum NodeType {
|
|||||||
UNUSED
|
UNUSED
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Absolute BFS order IDs to col-wise unique IDs based on user input
|
* @brief Absolute BFS order IDs to col-wise unique IDs based on user input
|
||||||
* @param tid the index of the element that this thread should access
|
* @param tid the index of the element that this thread should access
|
||||||
|
|||||||
@ -17,9 +17,8 @@
|
|||||||
|
|
||||||
#include "../../../../src/tree/param.h"
|
#include "../../../../src/tree/param.h"
|
||||||
#include "gradients.cuh"
|
#include "gradients.cuh"
|
||||||
#include "node.cuh"
|
|
||||||
#include "loss_functions.cuh"
|
#include "loss_functions.cuh"
|
||||||
|
#include "node.cuh"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace tree {
|
namespace tree {
|
||||||
@ -39,7 +38,7 @@ namespace exact {
|
|||||||
template <typename node_id_t>
|
template <typename node_id_t>
|
||||||
DEV_INLINE void updateOneChildNode(Node<node_id_t>* nodes, int nid,
|
DEV_INLINE void updateOneChildNode(Node<node_id_t>* nodes, int nid,
|
||||||
const gpu_gpair& grad,
|
const gpu_gpair& grad,
|
||||||
const TrainParam ¶m) {
|
const TrainParam& param) {
|
||||||
nodes[nid].gradSum = grad;
|
nodes[nid].gradSum = grad;
|
||||||
nodes[nid].score = CalcGain(param, grad.g, grad.h);
|
nodes[nid].score = CalcGain(param, grad.g, grad.h);
|
||||||
nodes[nid].weight = CalcWeight(param, grad.g, grad.h);
|
nodes[nid].weight = CalcWeight(param, grad.g, grad.h);
|
||||||
@ -58,18 +57,18 @@ DEV_INLINE void updateOneChildNode(Node<node_id_t>* nodes, int nid,
|
|||||||
template <typename node_id_t>
|
template <typename node_id_t>
|
||||||
DEV_INLINE void updateChildNodes(Node<node_id_t>* nodes, int pid,
|
DEV_INLINE void updateChildNodes(Node<node_id_t>* nodes, int pid,
|
||||||
const gpu_gpair& gradL, const gpu_gpair& gradR,
|
const gpu_gpair& gradL, const gpu_gpair& gradR,
|
||||||
const TrainParam ¶m) {
|
const TrainParam& param) {
|
||||||
int childId = (pid * 2) + 1;
|
int childId = (pid * 2) + 1;
|
||||||
updateOneChildNode(nodes, childId, gradL, param);
|
updateOneChildNode(nodes, childId, gradL, param);
|
||||||
updateOneChildNode(nodes, childId+1, gradR, param);
|
updateOneChildNode(nodes, childId + 1, gradR, param);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename node_id_t>
|
template <typename node_id_t>
|
||||||
DEV_INLINE void updateNodeAndChildren(Node<node_id_t>* nodes, const Split& s,
|
DEV_INLINE void updateNodeAndChildren(Node<node_id_t>* nodes, const Split& s,
|
||||||
const Node<node_id_t>& n, int absNodeId, int colId,
|
const Node<node_id_t>& n, int absNodeId,
|
||||||
const gpu_gpair& gradScan,
|
int colId, const gpu_gpair& gradScan,
|
||||||
const gpu_gpair& colSum, float thresh,
|
const gpu_gpair& colSum, float thresh,
|
||||||
const TrainParam ¶m) {
|
const TrainParam& param) {
|
||||||
bool missingLeft = true;
|
bool missingLeft = true;
|
||||||
// get the default direction for the current node
|
// get the default direction for the current node
|
||||||
gpu_gpair missing = n.gradSum - colSum;
|
gpu_gpair missing = n.gradSum - colSum;
|
||||||
@ -84,19 +83,17 @@ DEV_INLINE void updateNodeAndChildren(Node<node_id_t>* nodes, const Split& s,
|
|||||||
rGradSum = n.gradSum - lGradSum;
|
rGradSum = n.gradSum - lGradSum;
|
||||||
updateChildNodes(nodes, absNodeId, lGradSum, rGradSum, param);
|
updateChildNodes(nodes, absNodeId, lGradSum, rGradSum, param);
|
||||||
// update default-dir, threshold and feature id for current node
|
// update default-dir, threshold and feature id for current node
|
||||||
nodes[absNodeId].dir = missingLeft? LeftDir : RightDir;
|
nodes[absNodeId].dir = missingLeft ? LeftDir : RightDir;
|
||||||
nodes[absNodeId].colIdx = colId;
|
nodes[absNodeId].colIdx = colId;
|
||||||
nodes[absNodeId].threshold = thresh;
|
nodes[absNodeId].threshold = thresh;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename node_id_t, int BLKDIM=256>
|
template <typename node_id_t, int BLKDIM = 256>
|
||||||
__global__ void split2nodeKernel(Node<node_id_t>* nodes, const Split* nodeSplits,
|
__global__ void split2nodeKernel(
|
||||||
const gpu_gpair* gradScans,
|
Node<node_id_t>* nodes, const Split* nodeSplits, const gpu_gpair* gradScans,
|
||||||
const gpu_gpair* gradSums, const float* vals,
|
const gpu_gpair* gradSums, const float* vals, const int* colIds,
|
||||||
const int* colIds, const int* colOffsets,
|
const int* colOffsets, const node_id_t* nodeAssigns, int nUniqKeys,
|
||||||
const node_id_t* nodeAssigns, int nUniqKeys,
|
node_id_t nodeStart, int nCols, const TrainParam param) {
|
||||||
node_id_t nodeStart, int nCols,
|
|
||||||
const TrainParam param) {
|
|
||||||
int uid = (blockIdx.x * blockDim.x) + threadIdx.x;
|
int uid = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||||
if (uid >= nUniqKeys) {
|
if (uid >= nUniqKeys) {
|
||||||
return;
|
return;
|
||||||
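The child-update code above leans entirely on the dense BFS-order layout: updateChildNodes writes to pid*2+1 and pid*2+2, and markLeavesKernel earlier reads the same slots. Spelled out as plain index arithmetic (these helpers are illustrative; the plugin computes the expressions inline):

    // Index arithmetic implied by the dense BFS-order node array:
    // children of i live at 2*i+1 and 2*i+2, its parent at (i-1)/2,
    // and level d occupies indices [2^d - 1, 2^(d+1) - 2].
    inline int left_child(int nid)    { return 2 * nid + 1; }
    inline int right_child(int nid)   { return 2 * nid + 2; }
    inline int parent_of(int nid)     { return (nid - 1) / 2; }
    inline int level_begin(int depth) { return (1 << depth) - 1; }

This is why updateChildNodes only needs the parent id to locate both children, and why a whole level can be addressed with a single contiguous offset.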
@ -105,11 +102,11 @@ __global__ void split2nodeKernel(Node<node_id_t>* nodes, const Split* nodeSplits
|
|||||||
Split s = nodeSplits[uid];
|
Split s = nodeSplits[uid];
|
||||||
if (s.isSplittable(param.min_split_loss)) {
|
if (s.isSplittable(param.min_split_loss)) {
|
||||||
int idx = s.index;
|
int idx = s.index;
|
||||||
int nodeInstId = abs2uniqKey(idx, nodeAssigns, colIds, nodeStart,
|
int nodeInstId =
|
||||||
nUniqKeys);
|
abs2uniqKey(idx, nodeAssigns, colIds, nodeStart, nUniqKeys);
|
||||||
updateNodeAndChildren(nodes, s, nodes[absNodeId], absNodeId,
|
updateNodeAndChildren(nodes, s, nodes[absNodeId], absNodeId, colIds[idx],
|
||||||
colIds[idx], gradScans[idx],
|
gradScans[idx], gradSums[nodeInstId], vals[idx],
|
||||||
gradSums[nodeInstId], vals[idx], param);
|
param);
|
||||||
} else {
|
} else {
|
||||||
// cannot be split further, so this node is a leaf!
|
// cannot be split further, so this node is a leaf!
|
||||||
nodes[absNodeId].score = -FLT_MAX;
|
nodes[absNodeId].score = -FLT_MAX;
|
||||||
@ -129,20 +126,20 @@ __global__ void split2nodeKernel(Node<node_id_t>* nodes, const Split* nodeSplits
|
|||||||
* @param nUniqKeys number of nodes that we are currently working on
|
* @param nUniqKeys number of nodes that we are currently working on
|
||||||
* @param nodeStart start offset of the nodes in the overall BFS tree
|
* @param nodeStart start offset of the nodes in the overall BFS tree
|
||||||
* @param nCols number of columns
|
* @param nCols number of columns
|
||||||
* @param preUniquifiedKeys whether to uniquify the keys from inside kernel or not
|
* @param preUniquifiedKeys whether to uniquify the keys from inside kernel or
|
||||||
|
* not
|
||||||
* @param param the training parameter struct
|
* @param param the training parameter struct
|
||||||
*/
|
*/
|
||||||
template <typename node_id_t, int BLKDIM=256>
|
template <typename node_id_t, int BLKDIM = 256>
|
||||||
void split2node(Node<node_id_t>* nodes, const Split* nodeSplits, const gpu_gpair* gradScans,
|
void split2node(Node<node_id_t>* nodes, const Split* nodeSplits,
|
||||||
const gpu_gpair* gradSums, const float* vals, const int* colIds,
|
const gpu_gpair* gradScans, const gpu_gpair* gradSums,
|
||||||
const int* colOffsets, const node_id_t* nodeAssigns,
|
const float* vals, const int* colIds, const int* colOffsets,
|
||||||
int nUniqKeys, node_id_t nodeStart, int nCols,
|
const node_id_t* nodeAssigns, int nUniqKeys,
|
||||||
const TrainParam param) {
|
node_id_t nodeStart, int nCols, const TrainParam param) {
|
||||||
int nBlks = dh::div_round_up(nUniqKeys, BLKDIM);
|
int nBlks = dh::div_round_up(nUniqKeys, BLKDIM);
|
||||||
split2nodeKernel<<<nBlks,BLKDIM>>>(nodes, nodeSplits, gradScans, gradSums,
|
split2nodeKernel<<<nBlks, BLKDIM>>>(nodes, nodeSplits, gradScans, gradSums,
|
||||||
vals, colIds, colOffsets, nodeAssigns,
|
vals, colIds, colOffsets, nodeAssigns,
|
||||||
nUniqKeys, nodeStart, nCols,
|
nUniqKeys, nodeStart, nCols, param);
|
||||||
param);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace exact
|
} // namespace exact
|
||||||
|
|||||||
@ -73,11 +73,12 @@ struct GPUData {
|
|||||||
n_features, foffsets.data(), foffsets.data() + 1);
|
n_features, foffsets.data(), foffsets.data() + 1);
|
||||||
|
|
||||||
// Allocate memory
|
// Allocate memory
|
||||||
size_t free_memory = dh::available_memory(param_in.gpu_id);
|
size_t free_memory =
|
||||||
ba.allocate(param_in.gpu_id,
|
dh::available_memory(dh::get_device_idx(param_in.gpu_id));
|
||||||
&fvalues, in_fvalues.size(), &fvalues_temp,
|
ba.allocate(
|
||||||
in_fvalues.size(), &fvalues_cached, in_fvalues.size(), &foffsets,
|
dh::get_device_idx(param_in.gpu_id), &fvalues, in_fvalues.size(),
|
||||||
in_foffsets.size(), &instance_id, in_instance_id.size(),
|
&fvalues_temp, in_fvalues.size(), &fvalues_cached, in_fvalues.size(),
|
||||||
|
&foffsets, in_foffsets.size(), &instance_id, in_instance_id.size(),
|
||||||
&instance_id_temp, in_instance_id.size(), &instance_id_cached,
|
&instance_id_temp, in_instance_id.size(), &instance_id_cached,
|
||||||
in_instance_id.size(), &feature_id, in_feature_id.size(), &node_id,
|
in_instance_id.size(), &feature_id, in_feature_id.size(), &node_id,
|
||||||
in_fvalues.size(), &node_id_temp, in_fvalues.size(), &node_id_instance,
|
in_fvalues.size(), &node_id_temp, in_fvalues.size(), &node_id_instance,
|
||||||
@ -91,7 +92,7 @@ struct GPUData {
|
|||||||
const int mb_size = 1048576;
|
const int mb_size = 1048576;
|
||||||
LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
|
LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
|
||||||
<< free_memory / mb_size << " MB on "
|
<< free_memory / mb_size << " MB on "
|
||||||
<< dh::device_name(param_in.gpu_id);
|
<< dh::device_name(dh::get_device_idx(param_in.gpu_id));
|
||||||
}
|
}
|
||||||
|
|
||||||
fvalues_cached = in_fvalues;
|
fvalues_cached = in_fvalues;
|
||||||
|
|||||||
@ -125,7 +125,7 @@ void GPUHistBuilder::InitData(const std::vector<bst_gpair>& gpair,
|
|||||||
// set dList member
|
// set dList member
|
||||||
dList.resize(n_devices);
|
dList.resize(n_devices);
|
||||||
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
|
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
|
||||||
int device_idx = (param.gpu_id + d_idx) % n_devices;
|
int device_idx = (param.gpu_id + d_idx) % dh::n_visible_devices();
|
||||||
dList[d_idx] = device_idx;
|
dList[d_idx] = device_idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
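The small-looking change above is the heart of the multi-GPU gpu_id fix: the device list used to be built modulo n_devices (the number of GPUs requested), so any nonzero gpu_id wrapped straight back onto devices 0..n_devices-1 and the requested starting GPU was ignored. Wrapping by the number of visible devices keeps the list anchored at gpu_id. A small sketch of the fixed behaviour (function name illustrative):

    #include <vector>

    // Example: gpu_id = 2, n_devices = 2 on a 4-GPU machine.
    //   old: (2 + d) % n_devices -> {0, 1}  (ignores gpu_id)
    //   new: (2 + d) % n_visible -> {2, 3}
    std::vector<int> make_device_list(int gpu_id, int n_devices, int n_visible) {
      std::vector<int> dList(n_devices);
      for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
        dList[d_idx] = (gpu_id + d_idx) % n_visible;
      }
      return dList;
    }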
@ -141,7 +141,8 @@ void GPUHistBuilder::InitData(const std::vector<bst_gpair>& gpair,
|
|||||||
|
|
||||||
// printf("# NCCL: Using devices\n");
|
// printf("# NCCL: Using devices\n");
|
||||||
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
|
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
|
||||||
streams[d_idx] = reinterpret_cast<cudaStream_t*>(malloc(sizeof(cudaStream_t)));
|
streams[d_idx] =
|
||||||
|
reinterpret_cast<cudaStream_t*>(malloc(sizeof(cudaStream_t)));
|
||||||
dh::safe_cuda(cudaSetDevice(dList[d_idx]));
|
dh::safe_cuda(cudaSetDevice(dList[d_idx]));
|
||||||
dh::safe_cuda(cudaStreamCreate(streams[d_idx]));
|
dh::safe_cuda(cudaStreamCreate(streams[d_idx]));
|
||||||
|
|
||||||
@ -159,10 +160,11 @@ void GPUHistBuilder::InitData(const std::vector<bst_gpair>& gpair,
|
|||||||
// local find_split group of comms for each case of reduced number of GPUs
|
// local find_split group of comms for each case of reduced number of GPUs
|
||||||
// to use
|
// to use
|
||||||
find_split_comms.resize(
|
find_split_comms.resize(
|
||||||
n_devices, std::vector<ncclComm_t>(n_devices)); // TODO(JCM): Excessive, but
|
n_devices,
|
||||||
// ok, and best to do
|
std::vector<ncclComm_t>(n_devices)); // TODO(JCM): Excessive, but
|
||||||
// here instead of
|
// ok, and best to do
|
||||||
// repeatedly
|
// here instead of
|
||||||
|
// repeatedly
|
||||||
for (int num_d = 1; num_d <= n_devices;
|
for (int num_d = 1; num_d <= n_devices;
|
||||||
++num_d) { // loop over number of devices used
|
++num_d) { // loop over number of devices used
|
||||||
dh::safe_nccl(ncclCommInitAll(find_split_comms[num_d - 1].data(), num_d,
|
dh::safe_nccl(ncclCommInitAll(find_split_comms[num_d - 1].data(), num_d,
|
||||||
@ -377,7 +379,8 @@ void GPUHistBuilder::BuildHist(int depth) {
|
|||||||
#if (NCCL)
|
#if (NCCL)
|
||||||
// (in-place) reduce each element of histogram (for only current level) across
|
// (in-place) reduce each element of histogram (for only current level) across
|
||||||
// multiple gpus
|
// multiple gpus
|
||||||
// TODO(JCM): use out of place with pre-allocated buffer, but then have to copy
|
// TODO(JCM): use out of place with pre-allocated buffer, but then have to
|
||||||
|
// copy
|
||||||
// back on device
|
// back on device
|
||||||
// fprintf(stderr,"sizeof(gpu_gpair)/sizeof(float)=%d\n",sizeof(gpu_gpair)/sizeof(float));
|
// fprintf(stderr,"sizeof(gpu_gpair)/sizeof(float)=%d\n",sizeof(gpu_gpair)/sizeof(float));
|
||||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||||
@ -621,110 +624,128 @@ void GPUHistBuilder::LaunchFindSplit(int depth) {
|
|||||||
bool colsample =
|
bool colsample =
|
||||||
param.colsample_bylevel < 1.0 || param.colsample_bytree < 1.0;
|
param.colsample_bylevel < 1.0 || param.colsample_bytree < 1.0;
|
||||||
|
|
||||||
// use power of 2 for split finder because nodes are power of 2 (broadcast
|
int dosimuljob = 1;
|
||||||
// result to remaining devices)
|
|
||||||
int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
|
|
||||||
find_split_n_devices = std::min(n_nodes_level(depth), find_split_n_devices);
|
|
||||||
int num_nodes_device = n_nodes_level(depth) / find_split_n_devices;
|
|
||||||
int num_nodes_child_device = n_nodes_level(depth + 1) / find_split_n_devices;
|
|
||||||
const int GRID_SIZE = num_nodes_device;
|
|
||||||
|
|
||||||
#if (NCCL)
|
#if (NCCL)
|
||||||
// NOTE: No need to scatter before gather as all devices have same copy of
|
int simuljob = 1; // whether to do job on single GPU and broadcast (0) or to
|
||||||
// nodes, and within find_split_kernel() nodes_temp is given values from nodes
|
// do same job on each GPU (1) (could make user parameter,
|
||||||
|
// but too fine-grained maybe)
|
||||||
|
int findsplit_shardongpus = 0; // too expensive generally, disable for now
|
||||||
|
|
||||||
// for all nodes (split among devices) find best split per node
|
if (NCCL && findsplit_shardongpus) {
|
||||||
for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
|
dosimuljob = 0;
|
||||||
int device_idx = dList[d_idx];
|
// use power of 2 for split finder because nodes are power of 2 (broadcast
|
||||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
// result to remaining devices)
|
||||||
|
int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
|
||||||
|
find_split_n_devices = std::min(n_nodes_level(depth), find_split_n_devices);
|
||||||
|
int num_nodes_device = n_nodes_level(depth) / find_split_n_devices;
|
||||||
|
int num_nodes_child_device =
|
||||||
|
n_nodes_level(depth + 1) / find_split_n_devices;
|
||||||
|
const int GRID_SIZE = num_nodes_device;
|
||||||
|
|
||||||
int nodes_offset_device = d_idx * num_nodes_device;
|
// NOTE: No need to scatter before gather as all devices have same copy of
|
||||||
find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
|
// nodes, and within find_split_kernel() nodes_temp is given values from
|
||||||
(const gpu_gpair*)(hist_vec[d_idx].GetLevelPtr(depth)),
|
// nodes
|
||||||
feature_segments[d_idx].data(), depth, (info->num_col),
|
|
||||||
(hmat_.row_ptr.back()), nodes[d_idx].data(),
|
|
||||||
nodes_temp[d_idx].data(), nodes_child_temp[d_idx].data(),
|
|
||||||
nodes_offset_device, fidx_min_map[d_idx].data(),
|
|
||||||
gidx_fvalue_map[d_idx].data(), gpu_param,
|
|
||||||
left_child_smallest_temp[d_idx].data(), colsample,
|
|
||||||
feature_flags[d_idx].data());
|
|
||||||
}
|
|
||||||
|
|
||||||
// nccl only on devices that did split
|
// for all nodes (split among devices) find best split per node
|
||||||
dh::synchronize_n_devices(find_split_n_devices, dList);
|
for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
|
||||||
|
|
||||||
for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
|
|
||||||
int device_idx = dList[d_idx];
|
|
||||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
|
||||||
|
|
||||||
dh::safe_nccl(ncclAllGather(
|
|
||||||
reinterpret_cast<const void*>(nodes_temp[d_idx].data()),
|
|
||||||
num_nodes_device * sizeof(Node) / sizeof(char), ncclChar,
|
|
||||||
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
|
|
||||||
find_split_comms[find_split_n_devices - 1][d_idx], *(streams[d_idx])));
|
|
||||||
|
|
||||||
if (depth !=
|
|
||||||
param.max_depth) { // don't copy over children nodes if no more nodes
|
|
||||||
dh::safe_nccl(
|
|
||||||
ncclAllGather(reinterpret_cast<const void*>(nodes_child_temp[d_idx].data()),
|
|
||||||
num_nodes_child_device * sizeof(Node) / sizeof(char),
|
|
||||||
ncclChar, reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
|
|
||||||
find_split_comms[find_split_n_devices - 1][d_idx],
|
|
||||||
*(streams[d_idx]))); // Note offset by n_nodes(depth)
|
|
||||||
// for recvbuff for child nodes
|
|
||||||
}
|
|
||||||
|
|
||||||
dh::safe_nccl(ncclAllGather(
|
|
||||||
reinterpret_cast<const void*>(left_child_smallest_temp[d_idx].data()),
|
|
||||||
num_nodes_device * sizeof(bool) / sizeof(char), ncclChar,
|
|
||||||
reinterpret_cast<void*>(left_child_smallest[d_idx].data() + n_nodes(depth - 1)),
|
|
||||||
find_split_comms[find_split_n_devices - 1][d_idx], *(streams[d_idx])));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
|
|
||||||
int device_idx = dList[d_idx];
|
|
||||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
|
||||||
dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (n_devices > find_split_n_devices && n_devices > 1) {
|
|
||||||
// if n_devices==1, no need to Bcast
|
|
||||||
// if find_split_n_devices==1, this is just a copy operation, else it copies
|
|
||||||
// from master to all nodes in case extra devices not involved in split
|
|
||||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
|
||||||
int device_idx = dList[d_idx];
|
int device_idx = dList[d_idx];
|
||||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||||
|
|
||||||
int master_device = dList[0];
|
int nodes_offset_device = d_idx * num_nodes_device;
|
||||||
dh::safe_nccl(
|
find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
|
||||||
ncclBcast(reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
|
(const gpu_gpair*)(hist_vec[d_idx].GetLevelPtr(depth)),
|
||||||
n_nodes_level(depth) * sizeof(Node) / sizeof(char),
|
feature_segments[d_idx].data(), depth, (info->num_col),
|
||||||
ncclChar, master_device, comms[d_idx], *(streams[d_idx])));
|
(hmat_.row_ptr.back()), nodes[d_idx].data(), nodes_temp[d_idx].data(),
|
||||||
|
nodes_child_temp[d_idx].data(), nodes_offset_device,
|
||||||
|
fidx_min_map[d_idx].data(), gidx_fvalue_map[d_idx].data(), gpu_param,
|
||||||
|
left_child_smallest_temp[d_idx].data(), colsample,
|
||||||
|
feature_flags[d_idx].data());
|
||||||
|
}
|
||||||
|
|
||||||
|
// nccl only on devices that did split
|
||||||
|
dh::synchronize_n_devices(find_split_n_devices, dList);
|
||||||
|
|
||||||
|
for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
|
||||||
|
int device_idx = dList[d_idx];
|
||||||
|
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||||
|
|
||||||
|
dh::safe_nccl(ncclAllGather(
|
||||||
|
reinterpret_cast<const void*>(nodes_temp[d_idx].data()),
|
||||||
|
num_nodes_device * sizeof(Node) / sizeof(char), ncclChar,
|
||||||
|
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
|
||||||
|
find_split_comms[find_split_n_devices - 1][d_idx],
|
||||||
|
*(streams[d_idx])));
|
||||||
|
|
||||||
if (depth !=
|
if (depth !=
|
||||||
param.max_depth) { // don't copy over children nodes if no more nodes
|
param.max_depth) { // don't copy over children nodes if no more nodes
|
||||||
dh::safe_nccl(ncclBcast(
|
dh::safe_nccl(ncclAllGather(
|
||||||
|
reinterpret_cast<const void*>(nodes_child_temp[d_idx].data()),
|
||||||
|
num_nodes_child_device * sizeof(Node) / sizeof(char), ncclChar,
|
||||||
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
|
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
|
||||||
n_nodes_level(depth + 1) * sizeof(Node) / sizeof(char), ncclChar,
|
find_split_comms[find_split_n_devices - 1][d_idx],
|
||||||
master_device, comms[d_idx], *(streams[d_idx])));
|
*(streams[d_idx]))); // Note offset by n_nodes(depth)
|
||||||
|
// for recvbuff for child nodes
|
||||||
}
|
}
|
||||||
|
|
||||||
dh::safe_nccl(ncclBcast(
|
dh::safe_nccl(ncclAllGather(
|
||||||
reinterpret_cast<void*>(left_child_smallest[d_idx].data() + n_nodes(depth - 1)),
|
reinterpret_cast<const void*>(left_child_smallest_temp[d_idx].data()),
|
||||||
n_nodes_level(depth) * sizeof(bool) / sizeof(char), ncclChar,
|
num_nodes_device * sizeof(bool) / sizeof(char), ncclChar,
|
||||||
master_device, comms[d_idx], *(streams[d_idx])));
|
reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
|
||||||
|
n_nodes(depth - 1)),
|
||||||
|
find_split_comms[find_split_n_devices - 1][d_idx],
|
||||||
|
*(streams[d_idx])));
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
|
||||||
int device_idx = dList[d_idx];
|
int device_idx = dList[d_idx];
|
||||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||||
dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
|
dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
if (n_devices > find_split_n_devices && n_devices > 1) {
|
||||||
{
|
// if n_devices==1, no need to Bcast
|
||||||
|
// if find_split_n_devices==1, this is just a copy operation, else it
|
||||||
|
// copies
|
||||||
|
// from master to all nodes in case extra devices not involved in split
|
||||||
|
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||||
|
int device_idx = dList[d_idx];
|
||||||
|
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||||
|
|
||||||
|
int master_device = dList[0];
|
||||||
|
dh::safe_nccl(ncclBcast(
|
||||||
|
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
|
||||||
|
n_nodes_level(depth) * sizeof(Node) / sizeof(char), ncclChar,
|
||||||
|
master_device, comms[d_idx], *(streams[d_idx])));
|
||||||
|
|
||||||
|
if (depth != param.max_depth) { // don't copy over children nodes if no
|
||||||
|
// more nodes
|
||||||
|
dh::safe_nccl(ncclBcast(
|
||||||
|
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
|
||||||
|
n_nodes_level(depth + 1) * sizeof(Node) / sizeof(char), ncclChar,
|
||||||
|
master_device, comms[d_idx], *(streams[d_idx])));
|
||||||
|
}
|
||||||
|
|
||||||
|
dh::safe_nccl(ncclBcast(
|
||||||
|
reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
|
||||||
|
n_nodes(depth - 1)),
|
||||||
|
n_nodes_level(depth) * sizeof(bool) / sizeof(char), ncclChar,
|
||||||
|
master_device, comms[d_idx], *(streams[d_idx])));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||||
|
int device_idx = dList[d_idx];
|
||||||
|
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||||
|
dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (simuljob == 0 && NCCL == 1) {
|
||||||
|
dosimuljob = 0;
|
||||||
|
int num_nodes_device = n_nodes_level(depth);
|
||||||
|
const int GRID_SIZE = num_nodes_device;
|
||||||
|
|
||||||
int d_idx = 0;
|
int d_idx = 0;
|
||||||
|
int master_device = dList[d_idx];
|
||||||
int device_idx = dList[d_idx];
|
int device_idx = dList[d_idx];
|
||||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||||
|
|
||||||
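On the sharded find-split path above, the reworked code keeps the original sizing logic: the 2^depth nodes of the current level are spread over the largest power-of-two subset of the devices (capped by the node count itself), so every participating GPU gets the same whole number of nodes and the NCCL all-gathers line up. A compact restatement, with n_nodes_level assumed to be 2^depth as the dense layout suggests:

    #include <algorithm>
    #include <cmath>

    inline int n_nodes_level_sketch(int depth) { return 1 << depth; }

    // Largest power of two <= n_devices, capped by the nodes on this level.
    inline int find_split_devices(int n_devices, int depth) {
      const int pow2 =
          static_cast<int>(std::pow(2, std::floor(std::log2(n_devices))));
      return std::min(n_nodes_level_sketch(depth), pow2);
    }
    // n_devices = 6, depth = 3 -> 4 devices, 2 nodes each;
    // n_devices = 6, depth = 1 -> 2 devices, 1 node each.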
@ -737,9 +758,63 @@ void GPUHistBuilder::LaunchFindSplit(int depth) {
|
|||||||
gidx_fvalue_map[d_idx].data(), gpu_param,
|
gidx_fvalue_map[d_idx].data(), gpu_param,
|
||||||
left_child_smallest[d_idx].data(), colsample,
|
left_child_smallest[d_idx].data(), colsample,
|
||||||
feature_flags[d_idx].data());
|
feature_flags[d_idx].data());
|
||||||
|
|
||||||
|
// broadcast result
|
||||||
|
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||||
|
int device_idx = dList[d_idx];
|
||||||
|
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||||
|
|
||||||
|
dh::safe_nccl(ncclBcast(
|
||||||
|
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
|
||||||
|
n_nodes_level(depth) * sizeof(Node) / sizeof(char), ncclChar,
|
||||||
|
master_device, comms[d_idx], *(streams[d_idx])));
|
||||||
|
|
||||||
|
if (depth !=
|
||||||
|
param.max_depth) { // don't copy over children nodes if no more nodes
|
||||||
|
dh::safe_nccl(ncclBcast(
|
||||||
|
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
|
||||||
|
n_nodes_level(depth + 1) * sizeof(Node) / sizeof(char), ncclChar,
|
||||||
|
master_device, comms[d_idx], *(streams[d_idx])));
|
||||||
|
}
|
||||||
|
|
||||||
|
dh::safe_nccl(
|
||||||
|
ncclBcast(reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
|
||||||
|
n_nodes(depth - 1)),
|
||||||
|
n_nodes_level(depth) * sizeof(bool) / sizeof(char),
|
||||||
|
ncclChar, master_device, comms[d_idx], *(streams[d_idx])));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||||
|
int device_idx = dList[d_idx];
|
||||||
|
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||||
|
dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
dosimuljob = 1;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
if (dosimuljob) { // if no NCCL or simuljob==1, do this
|
||||||
|
int num_nodes_device = n_nodes_level(depth);
|
||||||
|
const int GRID_SIZE = num_nodes_device;
|
||||||
|
|
||||||
|
// all GPUs do same work
|
||||||
|
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||||
|
int device_idx = dList[d_idx];
|
||||||
|
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||||
|
|
||||||
|
int nodes_offset_device = 0;
|
||||||
|
find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
|
||||||
|
(const gpu_gpair*)(hist_vec[d_idx].GetLevelPtr(depth)),
|
||||||
|
feature_segments[d_idx].data(), depth, (info->num_col),
|
||||||
|
(hmat_.row_ptr.back()), nodes[d_idx].data(), NULL, NULL,
|
||||||
|
nodes_offset_device, fidx_min_map[d_idx].data(),
|
||||||
|
gidx_fvalue_map[d_idx].data(), gpu_param,
|
||||||
|
left_child_smallest[d_idx].data(), colsample,
|
||||||
|
feature_flags[d_idx].data());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// NOTE: No need to synchronize with host as all above pure P2P ops or
|
// NOTE: No need to synchronize with host as all above pure P2P ops or
|
||||||
// on-device ops
|
// on-device ops
|
||||||
}
|
}
|
||||||
@ -776,17 +851,15 @@ void GPUHistBuilder::InitFirstNode(const std::vector<bst_gpair>& gpair) {
|
|||||||
|
|
||||||
std::vector<std::future<gpu_gpair>> future_results(n_devices);
|
std::vector<std::future<gpu_gpair>> future_results(n_devices);
|
||||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||||
int device_idx = dList[d_idx];
|
|
||||||
|
|
||||||
auto begin = device_gpair[d_idx].tbegin();
|
|
||||||
auto end = device_gpair[d_idx].tend();
|
|
||||||
gpu_gpair init = gpu_gpair();
|
|
||||||
auto binary_op = thrust::plus<gpu_gpair>();
|
|
||||||
|
|
||||||
// std::async captures the algorithm parameters by value
|
// std::async captures the algorithm parameters by value
|
||||||
// use std::launch::async to ensure the creation of a new thread
|
// use std::launch::async to ensure the creation of a new thread
|
||||||
future_results[d_idx] = std::async(std::launch::async, [=] {
|
future_results[d_idx] = std::async(std::launch::async, [=] {
|
||||||
|
int device_idx = dList[d_idx];
|
||||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||||
|
auto begin = device_gpair[d_idx].tbegin();
|
||||||
|
auto end = device_gpair[d_idx].tend();
|
||||||
|
gpu_gpair init = gpu_gpair();
|
||||||
|
auto binary_op = thrust::plus<gpu_gpair>();
|
||||||
return thrust::reduce(begin, end, init, binary_op);
|
return thrust::reduce(begin, end, init, binary_op);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
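The InitFirstNode change above moves cudaSetDevice and the thrust iterator setup inside the std::launch::async lambda. That matters because the active CUDA device is per-thread state: each worker thread has to select its own GPU before reducing that GPU's gradient shard, and the host then sums the per-device partial results from the futures. A sketch of the same pattern, assuming each shard already lives on the device named in dList:

    #include <cstddef>
    #include <future>
    #include <vector>

    #include <cuda_runtime.h>
    #include <thrust/device_vector.h>
    #include <thrust/reduce.h>

    // One async worker per device: select the device on the worker thread,
    // reduce that device's shard, then sum the partial results on the host.
    double multi_gpu_sum(const std::vector<thrust::device_vector<double>>& shards,
                         const std::vector<int>& dList) {
      std::vector<std::future<double>> parts(shards.size());
      for (std::size_t d = 0; d < shards.size(); ++d) {
        parts[d] = std::async(std::launch::async, [&, d] {
          cudaSetDevice(dList[d]);  // device selection is per-thread state
          return thrust::reduce(shards[d].begin(), shards[d].end(), 0.0);
        });
      }
      double total = 0.0;
      for (auto& f : parts) total += f.get();
      return total;
    }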
@ -1047,8 +1120,8 @@ void GPUHistBuilder::Update(const std::vector<bst_gpair>& gpair,
|
|||||||
|
|
||||||
// done with multi-GPU, pass back result from master to tree on host
|
// done with multi-GPU, pass back result from master to tree on host
|
||||||
int master_device = dList[0];
|
int master_device = dList[0];
|
||||||
dense2sparse_tree(p_tree, nodes[master_device].tbegin(),
|
dh::safe_cuda(cudaSetDevice(master_device));
|
||||||
nodes[master_device].tend(), param);
|
dense2sparse_tree(p_tree, nodes[0].tbegin(), nodes[0].tend(), param);
|
||||||
}
|
}
|
||||||
} // namespace tree
|
} // namespace tree
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|||||||
@ -157,11 +157,11 @@ class ColMaker: public TreeUpdater {
|
|||||||
feat_index.push_back(i);
|
feat_index.push_back(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
unsigned n = static_cast<unsigned>(param.colsample_bytree * feat_index.size());
|
unsigned n = std::max(static_cast<unsigned>(1),
|
||||||
|
static_cast<unsigned>(param.colsample_bytree * feat_index.size()));
|
||||||
std::shuffle(feat_index.begin(), feat_index.end(), common::GlobalRandom());
|
std::shuffle(feat_index.begin(), feat_index.end(), common::GlobalRandom());
|
||||||
CHECK_GT(n, 0U)
|
CHECK_GT(param.colsample_bytree, 0U)
|
||||||
<< "colsample_bytree=" << param.colsample_bytree
|
<< "colsample_bytree cannot be zero.";
|
||||||
<< " is too small that no feature can be included";
|
|
||||||
feat_index.resize(n);
|
feat_index.resize(n);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
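The colsample change above fixes an easy-to-hit truncation: with a small colsample_bytree and few features, the float-to-unsigned cast could yield 0 and the subsequent CHECK aborted training even though the parameter was legal. Clamping to at least one feature, and checking the parameter itself instead, avoids that. In miniature (helper name illustrative):

    #include <algorithm>
    #include <cstddef>

    // colsample = 0.1 with 5 features: 0.5 truncates to 0 with a plain
    // cast, but the clamp keeps one feature in the sampled set.
    inline unsigned sampled_feature_count(float colsample, std::size_t n_features) {
      return std::max(static_cast<unsigned>(1),
                      static_cast<unsigned>(colsample * n_features));
    }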
@ -627,9 +627,10 @@ class ColMaker: public TreeUpdater {
|
|||||||
std::vector<bst_uint> feat_set = feat_index;
|
std::vector<bst_uint> feat_set = feat_index;
|
||||||
if (param.colsample_bylevel != 1.0f) {
|
if (param.colsample_bylevel != 1.0f) {
|
||||||
std::shuffle(feat_set.begin(), feat_set.end(), common::GlobalRandom());
|
std::shuffle(feat_set.begin(), feat_set.end(), common::GlobalRandom());
|
||||||
unsigned n = static_cast<unsigned>(param.colsample_bylevel * feat_index.size());
|
unsigned n = std::max(static_cast<unsigned>(1),
|
||||||
CHECK_GT(n, 0U)
|
static_cast<unsigned>(param.colsample_bylevel * feat_index.size()));
|
||||||
<< "colsample_bylevel is too small that no feature can be included";
|
CHECK_GT(param.colsample_bylevel, 0U)
|
||||||
|
<< "colsample_bylevel cannot be zero.";
|
||||||
feat_set.resize(n);
|
feat_set.resize(n);
|
||||||
}
|
}
|
||||||
dmlc::DataIter<ColBatch>* iter = p_fmat->ColIterator(feat_set);
|
dmlc::DataIter<ColBatch>* iter = p_fmat->ColIterator(feat_set);
|
||||||
|
|||||||
@ -409,11 +409,11 @@ class FastHistMaker: public TreeUpdater {
|
|||||||
feat_index.push_back(i);
|
feat_index.push_back(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
unsigned n = static_cast<unsigned>(param.colsample_bytree * feat_index.size());
|
unsigned n = std::max(static_cast<unsigned>(1),
|
||||||
|
static_cast<unsigned>(param.colsample_bytree * feat_index.size()));
|
||||||
std::shuffle(feat_index.begin(), feat_index.end(), common::GlobalRandom());
|
std::shuffle(feat_index.begin(), feat_index.end(), common::GlobalRandom());
|
||||||
CHECK_GT(n, 0U)
|
CHECK_GT(param.colsample_bytree, 0U)
|
||||||
<< "colsample_bytree=" << param.colsample_bytree
|
<< "colsample_bytree cannot be zero.";
|
||||||
<< " is too small that no feature can be included";
|
|
||||||
feat_index.resize(n);
|
feat_index.resize(n);
|
||||||
}
|
}
|
||||||
if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
|
if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
|
||||||