diff --git a/CMakeLists.txt b/CMakeLists.txt index fec99aed0..a3c7c783b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,7 +99,7 @@ if(PLUGIN_UPDATER_GPU) include_directories(${CUB_DIRECTORY}) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--expt-extended-lambda;-arch=compute_35") if(NOT MSVC) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-std=c++11; -Xcompiler -fPIC") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC") endif() set(SOURCES ${SOURCES} plugin/updater_gpu/src/updater_gpu.cc diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 000000000..172cb9cb8 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,23 @@ +environment: + matrix: + - solution_name: C:/projects/xgboost/build2013/xgboost.sln + - solution_name: C:/projects/xgboost/build2015/xgboost.sln +platform: + - x64 + +configuration: + - Debug + - Release + +install: + - git submodule update --init --recursive + +before_build: + - mkdir build2013 + - mkdir build2015 + - cd build2013 + - cmake .. -G"Visual Studio 12 2013 Win64" + - cd ../build2015 + - cmake .. -G"Visual Studio 14 2015 Win64" +build_script: + - msbuild %solution_name% diff --git a/plugin/updater_gpu/src/cuda_helpers.cuh b/plugin/updater_gpu/src/cuda_helpers.cuh index d46814e93..bd6fae113 100644 --- a/plugin/updater_gpu/src/cuda_helpers.cuh +++ b/plugin/updater_gpu/src/cuda_helpers.cuh @@ -274,3 +274,7 @@ template __device__ range block_stride_range(T begin, T end) { template T *raw(thrust::device_vector &v) { // NOLINT return raw_pointer_cast(v.data()); } + +template size_t size_bytes(const thrust::device_vector &v) { + return sizeof(T) * v.size(); +} diff --git a/plugin/updater_gpu/src/gpu_builder.cu b/plugin/updater_gpu/src/gpu_builder.cu index 3e8583942..266bb31d9 100644 --- a/plugin/updater_gpu/src/gpu_builder.cu +++ b/plugin/updater_gpu/src/gpu_builder.cu @@ -57,44 +57,66 @@ struct GPUData { const std::vector &in_gpair, bst_uint n_instances_in, bst_uint n_features_in, int max_depth, const TrainParam ¶m_in) { Timer t; + + // Track allocated device memory + size_t n_bytes = 0; + n_features = n_features_in; n_instances = n_instances_in; fvalues = in_fvalues; + n_bytes += size_bytes(fvalues); foffsets = in_foffsets; + n_bytes += size_bytes(foffsets); instance_id = in_instance_id; + n_bytes += size_bytes(instance_id); feature_id = in_feature_id; + n_bytes += size_bytes(feature_id); param = GPUTrainingParam(param_in.min_child_weight, param_in.reg_lambda, param_in.reg_alpha, param_in.max_delta_step); gpair = thrust::device_vector(in_gpair.begin(), in_gpair.end()); + n_bytes += size_bytes(gpair); uint32_t max_nodes_level = 1 << max_depth; node_sums = thrust::device_vector(max_nodes_level * n_features); + n_bytes += size_bytes(node_sums); node_offsets = thrust::device_vector(max_nodes_level * n_features); + n_bytes += size_bytes(node_offsets); node_id_instance = thrust::device_vector(n_instances, 0); + n_bytes += size_bytes(node_id_instance); node_id = thrust::device_vector(fvalues.size(), 0); + n_bytes += size_bytes(node_id); node_id_temp = thrust::device_vector(fvalues.size()); + n_bytes += size_bytes(node_id_temp); uint32_t max_nodes = (1 << (max_depth + 1)) - 1; nodes = thrust::device_vector(max_nodes); + n_bytes += size_bytes(nodes); split_candidates = thrust::device_vector(max_nodes_level * n_features); - allocated = true; + n_bytes += size_bytes(split_candidates); // Init items items = thrust::device_vector(fvalues.size()); + n_bytes += size_bytes(items); items_temp = thrust::device_vector(fvalues.size()); + n_bytes += size_bytes(items_temp); sort_index_in = thrust::device_vector(fvalues.size()); + n_bytes += size_bytes(sort_index_in); sort_index_out = thrust::device_vector(fvalues.size()); + n_bytes += size_bytes(sort_index_out); + + // std::cout << "Device memory allocated: " << n_bytes << "\n"; this->CreateItems(); + allocated = true; } ~GPUData() {} @@ -150,7 +172,10 @@ struct GPUData { GPUBuilder::GPUBuilder() { gpu_data = new GPUData(); } -void GPUBuilder::Init(const TrainParam ¶m_in) { param = param_in; } +void GPUBuilder::Init(const TrainParam ¶m_in) { + param = param_in; + CHECK(param.max_depth < 16) << "Max depth > 15 not supported."; +} GPUBuilder::~GPUBuilder() { delete gpu_data; } @@ -186,8 +211,9 @@ template __global__ void update_nodeid_fvalue_kernel(NodeIdT *d_node_id, NodeIdT *d_node_id_instance, Item *d_items, Node *d_nodes, const int n_nodes, - const int *d_feature_id, const size_t n, - const int n_features, bool cache_nodes) { + const int *d_foffsets, const int *d_feature_id, + const size_t n, const int n_features, + bool cache_nodes) { // Load nodes into shared memory extern __shared__ Node s_nodes[]; @@ -254,8 +280,8 @@ void GPUBuilder::UpdateNodeId(int level) { ITEMS_PER_THREAD><<>>( raw(gpu_data->node_id), raw(gpu_data->node_id_instance), raw(gpu_data->items), raw(gpu_data->nodes), n_nodes, - raw(gpu_data->feature_id), gpu_data->fvalues.size(), - gpu_data->n_features, cache_nodes); + raw(gpu_data->foffsets), raw(gpu_data->feature_id), + gpu_data->fvalues.size(), gpu_data->n_features, cache_nodes); safe_cuda(cudaGetLastError()); safe_cuda(cudaDeviceSynchronize()); @@ -267,35 +293,42 @@ void GPUBuilder::UpdateNodeId(int level) { void GPUBuilder::Sort(int level) { thrust::sequence(gpu_data->sort_index_in.begin(), gpu_data->sort_index_in.end()); + + cub::DoubleBuffer d_keys(raw(gpu_data->node_id), + raw(gpu_data->node_id_temp)); + cub::DoubleBuffer d_values(raw(gpu_data->sort_index_in), + raw(gpu_data->sort_index_out)); + if (!gpu_data->cub_mem.IsAllocated()) { cub::DeviceSegmentedRadixSort::SortPairs( gpu_data->cub_mem.d_temp_storage, gpu_data->cub_mem.temp_storage_bytes, - raw(gpu_data->node_id), raw(gpu_data->node_id_temp), - raw(gpu_data->sort_index_in), raw(gpu_data->sort_index_out), - gpu_data->fvalues.size(), gpu_data->n_features, raw(gpu_data->foffsets), - raw(gpu_data->foffsets) + 1); + d_keys, d_values, gpu_data->fvalues.size(), gpu_data->n_features, + raw(gpu_data->foffsets), raw(gpu_data->foffsets) + 1); gpu_data->cub_mem.Allocate(); } cub::DeviceSegmentedRadixSort::SortPairs( gpu_data->cub_mem.d_temp_storage, gpu_data->cub_mem.temp_storage_bytes, - raw(gpu_data->node_id), raw(gpu_data->node_id_temp), - raw(gpu_data->sort_index_in), raw(gpu_data->sort_index_out), - gpu_data->fvalues.size(), gpu_data->n_features, raw(gpu_data->foffsets), - raw(gpu_data->foffsets) + 1); + d_keys, d_values, gpu_data->fvalues.size(), gpu_data->n_features, + raw(gpu_data->foffsets), raw(gpu_data->foffsets) + 1); - thrust::gather(gpu_data->sort_index_out.begin(), - gpu_data->sort_index_out.end(), gpu_data->items.begin(), - gpu_data->items_temp.begin()); + thrust::gather(thrust::device_pointer_cast(d_values.Current()), + thrust::device_pointer_cast(d_values.Current()) + + gpu_data->sort_index_out.size(), + gpu_data->items.begin(), gpu_data->items_temp.begin()); thrust::copy(gpu_data->items_temp.begin(), gpu_data->items_temp.end(), gpu_data->items.begin()); - thrust::copy(gpu_data->node_id_temp.begin(), gpu_data->node_id_temp.end(), - gpu_data->node_id.begin()); + + if (d_keys.Current() == raw(gpu_data->node_id_temp)) { + thrust::copy(gpu_data->node_id_temp.begin(), gpu_data->node_id_temp.end(), + gpu_data->node_id.begin()); + } } void GPUBuilder::Update(const std::vector &gpair, DMatrix *p_fmat, RegTree *p_tree) { + cudaProfilerStart(); try { Timer update; Timer t; @@ -336,7 +369,14 @@ void GPUBuilder::Update(const std::vector &gpair, DMatrix *p_fmat, } catch (thrust::system_error &e) { std::cerr << "CUDA error: " << e.what() << std::endl; exit(-1); + } catch (const std::exception &e) { + std::cerr << "Error: " << e.what() << std::endl; + exit(-1); + } catch (...) { + std::cerr << "Unknown exception." << std::endl; + exit(-1); } + cudaProfilerStop(); } void GPUBuilder::InitData(const std::vector &gpair, DMatrix &fmat, diff --git a/plugin/updater_gpu/src/types.cuh b/plugin/updater_gpu/src/types.cuh index 8a9984416..63689b812 100644 --- a/plugin/updater_gpu/src/types.cuh +++ b/plugin/updater_gpu/src/types.cuh @@ -7,7 +7,7 @@ namespace xgboost { namespace tree { -typedef int32_t NodeIdT; +typedef int16_t NodeIdT; // gpair type defined with device accessible functions struct gpu_gpair {