/*!
 * Copyright 2016 Rory Mitchell
 */
#include <cub/cub.cuh>
#include <cuda_profiler_api.h>
#include <thrust/copy.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>
#include <thrust/system_error.h>
#include <algorithm>
#include <iostream>
#include <random>
#include <vector>
#include "../../../src/common/random.h"
#include "device_helpers.cuh"
#include "find_split.cuh"
#include "gpu_builder.cuh"
#include "types_functions.cuh"

namespace xgboost {
namespace tree {

struct GPUData {
  GPUData() : allocated(false), n_features(0), n_instances(0) {}

  bool allocated;
  int n_features;
  int n_instances;

  dh::bulk_allocator ba;

  GPUTrainingParam param;

  dh::dvec<float> fvalues;
  dh::dvec<float> fvalues_temp;
  dh::dvec<float> fvalues_cached;
  dh::dvec<int> foffsets;
  dh::dvec<bst_uint> instance_id;
  dh::dvec<bst_uint> instance_id_temp;
  dh::dvec<bst_uint> instance_id_cached;
  dh::dvec<int> feature_id;
  dh::dvec<NodeIdT> node_id;
  dh::dvec<NodeIdT> node_id_temp;
  dh::dvec<NodeIdT> node_id_instance;
  dh::dvec<gpu_gpair> gpair;
  dh::dvec<Node> nodes;
  dh::dvec<Split> split_candidates;
  dh::dvec<gpu_gpair> node_sums;
  dh::dvec<int> node_offsets;
  dh::dvec<int> sort_index_in;
  dh::dvec<int> sort_index_out;
  dh::dvec<char> cub_mem;

  ItemIter items_iter;

  void Init(const std::vector<float> &in_fvalues,
            const std::vector<int> &in_foffsets,
            const std::vector<bst_uint> &in_instance_id,
            const std::vector<int> &in_feature_id,
            const std::vector<bst_gpair> &in_gpair, bst_uint n_instances_in,
            bst_uint n_features_in, int max_depth,
            const TrainParam &param_in) {
    n_features = n_features_in;
    n_instances = n_instances_in;

    uint32_t max_nodes = (1 << (max_depth + 1)) - 1;
    uint32_t max_nodes_level = 1 << max_depth;

    // Calculate memory for sort
    size_t cub_mem_size = 0;
    cub::DoubleBuffer<NodeIdT> db_key;
    cub::DoubleBuffer<int> db_value;
    cub::DeviceSegmentedRadixSort::SortPairs(
        cub_mem.data(), cub_mem_size, db_key, db_value, in_fvalues.size(),
        n_features, foffsets.data(), foffsets.data() + 1);

    // Allocate memory
    size_t free_memory = dh::available_memory();
    ba.allocate(&fvalues, in_fvalues.size(), &fvalues_temp, in_fvalues.size(),
                &fvalues_cached, in_fvalues.size(), &foffsets,
                in_foffsets.size(), &instance_id, in_instance_id.size(),
                &instance_id_temp, in_instance_id.size(), &instance_id_cached,
                in_instance_id.size(), &feature_id, in_feature_id.size(),
                &node_id, in_fvalues.size(), &node_id_temp, in_fvalues.size(),
                &node_id_instance, n_instances, &gpair, n_instances, &nodes,
                max_nodes, &split_candidates, max_nodes_level * n_features,
                &node_sums, max_nodes_level * n_features, &node_offsets,
                max_nodes_level * n_features, &sort_index_in,
                in_fvalues.size(), &sort_index_out, in_fvalues.size(),
                &cub_mem, cub_mem_size);

    if (!param_in.silent) {
      const int mb_size = 1048576;
      LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
                   << free_memory / mb_size << " MB on " << dh::device_name();
    }

    node_id.fill(0);
    node_id_instance.fill(0);

    fvalues = in_fvalues;
    fvalues_cached = fvalues;

    foffsets = in_foffsets;
    instance_id = in_instance_id;
    instance_id_cached = instance_id;
    feature_id = in_feature_id;

    param = GPUTrainingParam(param_in.min_child_weight, param_in.reg_lambda,
                             param_in.reg_alpha, param_in.max_delta_step);

    gpair = in_gpair;

    nodes.fill(Node());

    items_iter = thrust::make_zip_iterator(thrust::make_tuple(
        thrust::make_permutation_iterator(gpair.tbegin(),
                                          instance_id.tbegin()),
        fvalues.tbegin(), node_id.tbegin()));

    allocated = true;

    dh::safe_cuda(cudaGetLastError());
  }

  ~GPUData() {}

  // Reset memory for new boosting iteration
  void Reset(const std::vector<bst_gpair> &in_gpair) {
    CHECK(allocated);
    gpair = in_gpair;
    instance_id = instance_id_cached;
    fvalues = fvalues_cached;
    nodes.fill(Node());
    node_id_instance.fill(0);
    node_id.fill(0);
  }

  bool IsAllocated() { return allocated; }
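
  // The builder keeps two views of node assignments: node_id_instance holds
  // one entry per training instance, while node_id holds one entry per
  // (feature, instance) element of the sparse matrix. After instances are
  // routed to child nodes, GatherNodeId() below propagates the per-instance
  // assignments back into the per-element view.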
  // Gather from node_id_instance into node_id according to instance_id
  void GatherNodeId() {
    // Update node_id for each item
    auto d_node_id = node_id.data();
    auto d_node_id_instance = node_id_instance.data();
    auto d_instance_id = instance_id.data();

    dh::launch_n(fvalues.size(), [=] __device__(bst_uint i) {
      d_node_id[i] = d_node_id_instance[d_instance_id[i]];
    });
  }
};

GPUBuilder::GPUBuilder() { gpu_data = new GPUData(); }

void GPUBuilder::Init(const TrainParam &param_in) {
  param = param_in;
  CHECK(param.max_depth < 16) << "Tree depth too large.";
}

GPUBuilder::~GPUBuilder() { delete gpu_data; }

void GPUBuilder::UpdateNodeId(int level) {
  auto *d_node_id_instance = gpu_data->node_id_instance.data();
  Node *d_nodes = gpu_data->nodes.data();

  // Route every instance towards the child given by its node's default
  // (missing-value) direction; flag instances in leaf nodes as finished
  dh::launch_n(gpu_data->node_id_instance.size(), [=] __device__(int i) {
    NodeIdT item_node_id = d_node_id_instance[i];

    if (item_node_id < 0) {
      return;
    }

    Node node = d_nodes[item_node_id];

    if (node.IsLeaf()) {
      d_node_id_instance[i] = -1;
    } else if (node.split.missing_left) {
      d_node_id_instance[i] = item_node_id * 2 + 1;
    } else {
      d_node_id_instance[i] = item_node_id * 2 + 2;
    }
  });

  dh::safe_cuda(cudaDeviceSynchronize());

  auto *d_fvalues = gpu_data->fvalues.data();
  auto *d_instance_id = gpu_data->instance_id.data();
  auto *d_node_id = gpu_data->node_id.data();
  auto *d_feature_id = gpu_data->feature_id.data();

  // Update node based on fvalue where it exists
  dh::launch_n(gpu_data->fvalues.size(), [=] __device__(int i) {
    NodeIdT item_node_id = d_node_id[i];

    if (item_node_id < 0) {
      return;
    }

    Node node = d_nodes[item_node_id];

    if (node.IsLeaf()) {
      return;
    }

    int feature_id = d_feature_id[i];

    if (feature_id == node.split.findex) {
      float fvalue = d_fvalues[i];
      bst_uint instance_id = d_instance_id[i];

      if (fvalue < node.split.fvalue) {
        d_node_id_instance[instance_id] = item_node_id * 2 + 1;
      } else {
        d_node_id_instance[instance_id] = item_node_id * 2 + 2;
      }
    }
  });

  dh::safe_cuda(cudaDeviceSynchronize());

  gpu_data->GatherNodeId();
}

void GPUBuilder::Sort(int level) {
  thrust::sequence(gpu_data->sort_index_in.tbegin(),
                   gpu_data->sort_index_in.tend());

  cub::DoubleBuffer<NodeIdT> d_keys(gpu_data->node_id.data(),
                                    gpu_data->node_id_temp.data());
  cub::DoubleBuffer<int> d_values(gpu_data->sort_index_in.data(),
                                  gpu_data->sort_index_out.data());

  size_t temp_size = gpu_data->cub_mem.size();

  cub::DeviceSegmentedRadixSort::SortPairs(
      gpu_data->cub_mem.data(), temp_size, d_keys, d_values,
      gpu_data->fvalues.size(), gpu_data->n_features,
      gpu_data->foffsets.data(), gpu_data->foffsets.data() + 1);

  auto zip = thrust::make_zip_iterator(thrust::make_tuple(
      gpu_data->fvalues.tbegin(), gpu_data->instance_id.tbegin()));
  auto zip_temp = thrust::make_zip_iterator(thrust::make_tuple(
      gpu_data->fvalues_temp.tbegin(), gpu_data->instance_id_temp.tbegin()));

  // Permute fvalues/instance_id into the sorted order
  thrust::gather(thrust::device_pointer_cast(d_values.Current()),
                 thrust::device_pointer_cast(d_values.Current()) +
                     gpu_data->sort_index_out.size(),
                 zip, zip_temp);

  thrust::copy(zip_temp, zip_temp + gpu_data->fvalues.size(), zip);

  // The radix sort may have left its output in the temporary key buffer
  if (d_keys.Current() == gpu_data->node_id_temp.data()) {
    thrust::copy(gpu_data->node_id_temp.tbegin(),
                 gpu_data->node_id_temp.tend(), gpu_data->node_id.tbegin());
  }
}

void GPUBuilder::Update(const std::vector<bst_gpair> &gpair, DMatrix *p_fmat,
                        RegTree *p_tree) {
  cudaProfilerStart();
  try {
    dh::Timer update;
    dh::Timer t;
    this->InitData(gpair, *p_fmat, *p_tree);
    t.printElapsed("init data");
    this->InitFirstNode();

    for (int level = 0; level < param.max_depth; level++) {
      bool use_multiscan_algorithm = level < multiscan_levels;

      t.reset();

      if (level > 0) {
        dh::Timer update_node;
        this->UpdateNodeId(level);
        update_node.printElapsed("node");
      }
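
      // Levels processed without the multiscan algorithm expect entries
      // grouped by node id within each feature segment, so re-sort after
      // node assignments change.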
      if (level > 0 && !use_multiscan_algorithm) {
        dh::Timer s;
        this->Sort(level);
        s.printElapsed("sort");
      }

      dh::Timer split;
      find_split(gpu_data->items_iter, gpu_data->split_candidates.data(),
                 gpu_data->nodes.data(), (bst_uint)gpu_data->fvalues.size(),
                 gpu_data->n_features, gpu_data->foffsets.data(),
                 gpu_data->node_sums.data(), gpu_data->node_offsets.data(),
                 gpu_data->param, level, use_multiscan_algorithm);
      split.printElapsed("split");

      t.printElapsed("level");
    }
    this->CopyTree(*p_tree);
    update.printElapsed("update");
  } catch (thrust::system_error &e) {
    std::cerr << "CUDA error: " << e.what() << std::endl;
    exit(-1);
  } catch (const std::exception &e) {
    std::cerr << "Error: " << e.what() << std::endl;
    exit(-1);
  } catch (...) {
    std::cerr << "Unknown exception." << std::endl;
    exit(-1);
  }
  cudaProfilerStop();
}

float GPUBuilder::GetSubsamplingRate(MetaInfo info) {
  float subsample = 1.0;
  uint32_t max_nodes = (1 << (param.max_depth + 1)) - 1;
  uint32_t max_nodes_level = 1 << param.max_depth;
  // Heuristic estimate of required device memory, in bytes
  size_t required = 10 * info.num_row + 40 * info.num_nonzero +
                    64 * max_nodes + 76 * max_nodes_level * info.num_col;
  std::cout << "required: " << required << "\n";
  size_t available = dh::available_memory();
  // Decrease the subsampling rate until the estimate fits in device memory
  while (available < required) {
    subsample -= 0.05;
    required = 10 * info.num_row + subsample * (44 * info.num_nonzero);
  }

  return subsample;
}

void GPUBuilder::InitData(const std::vector<bst_gpair> &gpair, DMatrix &fmat,
                          const RegTree &tree) {
  CHECK(fmat.SingleColBlock()) << "GPUMaker: must have single column block";

  if (gpu_data->IsAllocated()) {
    gpu_data->Reset(gpair);
    return;
  }

  dh::Timer t;

  MetaInfo info = fmat.info();

  // Work out if the dataset will fit on the GPU
  float subsample = this->GetSubsamplingRate(info);
  CHECK(subsample > 0.0);

  if (!param.silent && subsample < param.subsample) {
    LOG(CONSOLE) << "Not enough device memory for entire dataset.";
  }

  // Override subsample parameter if user-specified parameter is lower
  subsample = std::min(param.subsample, subsample);

  std::vector<bool> row_flags;

  if (subsample < 1.0) {
    if (!param.silent) {
      LOG(CONSOLE) << "Subsampling " << subsample * 100 << "% of rows.";
    }
    const RowSet &rowset = fmat.buffered_rowset();
    row_flags.resize(info.num_row);
    std::bernoulli_distribution coin_flip(subsample);
    auto &rnd = common::GlobalRandom();

    for (size_t i = 0; i < rowset.size(); ++i) {
      const bst_uint ridx = rowset[i];
      if (gpair[ridx].hess < 0.0f) continue;
      row_flags[ridx] = coin_flip(rnd);
    }
  }

  std::vector<int> foffsets;
  foffsets.push_back(0);
  std::vector<int> feature_id;
  std::vector<float> fvalues;
  std::vector<bst_uint> instance_id;
  fvalues.reserve(info.num_col * info.num_row);
  instance_id.reserve(info.num_col * info.num_row);
  feature_id.reserve(info.num_col * info.num_row);

  dmlc::DataIter<ColBatch> *iter = fmat.ColIterator();

  while (iter->Next()) {
    const ColBatch &batch = iter->Value();

    for (int i = 0; i < batch.size; i++) {
      const ColBatch::Inst &col = batch[i];

      for (const ColBatch::Entry *it = col.data; it != col.data + col.length;
           it++) {
        bst_uint inst_id = it->index;
        // Skip entries from rows excluded by subsampling
        if (subsample < 1.0) {
          if (row_flags[inst_id]) {
            fvalues.push_back(it->fvalue);
            instance_id.push_back(inst_id);
            feature_id.push_back(i);
          }
        } else {
          fvalues.push_back(it->fvalue);
          instance_id.push_back(inst_id);
          feature_id.push_back(i);
        }
      }
      foffsets.push_back(fvalues.size());
    }
  }

  t.printElapsed("dmatrix");
  t.reset();

  gpu_data->Init(fvalues, foffsets, instance_id, feature_id, gpair,
                 info.num_row, info.num_col, param.max_depth, param);

  t.printElapsed("gpu init");
}
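
// The root node's statistics are the sum of gradient pairs over all training
// instances; its gain and weight follow from CalcGain/CalcWeight under the
// current training parameters.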
void GPUBuilder::InitFirstNode() {
  // Build the root node on the CPU and copy to device
  gpu_gpair sum_gradients =
      thrust::reduce(gpu_data->gpair.tbegin(), gpu_data->gpair.tend(),
                     gpu_gpair(0, 0), cub::Sum());

  Node tmp = Node(
      sum_gradients,
      CalcGain(gpu_data->param, sum_gradients.grad(), sum_gradients.hess()),
      CalcWeight(gpu_data->param, sum_gradients.grad(),
                 sum_gradients.hess()));

  thrust::copy_n(&tmp, 1, gpu_data->nodes.tbegin());
}

enum NodeType {
  NODE = 0,
  LEAF = 1,
  UNUSED = 2,
};

// Recursively label node types
void flag_nodes(const thrust::host_vector<Node> &nodes,
                std::vector<NodeType> *node_flags, int nid, NodeType type) {
  if (nid >= nodes.size() || type == UNUSED) {
    return;
  }

  const Node &n = nodes[nid];

  // Current node and all children are valid
  if (n.split.loss_chg > rt_eps) {
    (*node_flags)[nid] = NODE;
    flag_nodes(nodes, node_flags, nid * 2 + 1, NODE);
    flag_nodes(nodes, node_flags, nid * 2 + 2, NODE);
  } else {
    // Current node is a leaf: it is valid, but all children are invalid
    (*node_flags)[nid] = LEAF;
    flag_nodes(nodes, node_flags, nid * 2 + 1, UNUSED);
    flag_nodes(nodes, node_flags, nid * 2 + 2, UNUSED);
  }
}

// Copy gpu dense representation of tree to xgboost sparse representation
void GPUBuilder::CopyTree(RegTree &tree) {
  std::vector<Node> h_nodes = gpu_data->nodes.as_vector();
  std::vector<NodeType> node_flags(h_nodes.size(), UNUSED);
  flag_nodes(h_nodes, &node_flags, 0, NODE);

  int nid = 0;
  for (int gpu_nid = 0; gpu_nid < h_nodes.size(); gpu_nid++) {
    NodeType flag = node_flags[gpu_nid];
    const Node &n = h_nodes[gpu_nid];
    if (flag == NODE) {
      tree.AddChilds(nid);
      tree[nid].set_split(n.split.findex, n.split.fvalue,
                          n.split.missing_left);
      tree.stat(nid).loss_chg = n.split.loss_chg;
      tree.stat(nid).base_weight = n.weight;
      tree.stat(nid).sum_hess = n.sum_gradients.hess();
      tree[tree[nid].cleft()].set_leaf(0);
      tree[tree[nid].cright()].set_leaf(0);
      nid++;
    } else if (flag == LEAF) {
      tree[nid].set_leaf(n.weight * param.learning_rate);
      tree.stat(nid).sum_hess = n.sum_gradients.hess();
      nid++;
    }
  }
}
}  // namespace tree
}  // namespace xgboost