From b38c636d0510db61e61bd486e8a53178d0f286d3 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Mon, 15 Oct 2018 09:39:13 -0700 Subject: [PATCH] Fix #3523: Fix CustomGlobalRandomEngine for R (#3781) **Symptom** Apple Clang's implementation of `std::shuffle` expects doesn't work correctly when it is run with the random bit generator for R package: ```cpp CustomGlobalRandomEngine::result_type CustomGlobalRandomEngine::operator()() { return static_cast( std::floor(unif_rand() * CustomGlobalRandomEngine::max())); } ``` Minimial reproduction of failure (compile using Apple Clang 10.0): ```cpp std::vector feature_set(100); std::iota(feature_set.begin(), feature_set.end(), 0); // initialize with 0, 1, 2, 3, ..., 99 std::shuffle(feature_set.begin(), feature_set.end(), common::GlobalRandom()); // This returns 0, 1, 2, ..., 99, so content didn't get shuffled at all!!! ``` Note that this bug is platform-dependent; it does not appear when GCC or upstream LLVM Clang is used. **Diagnosis** Apple Clang's `std::shuffle` expects 32-bit integer inputs, whereas `CustomGlobalRandomEngine::operator()` produces 64-bit integers. **Fix** Have `CustomGlobalRandomEngine::operator()` produce 32-bit integers. Closes #3523. --- R-package/tests/testthat/test_basic.R | 25 +++++++++++++++++++++++++ src/common/random.h | 4 ++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index c9cf73581..6b1331a84 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -237,3 +237,28 @@ test_that("max_delta_step works", { expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss)) expect_lt(mean(bst1$evaluation_log$train_logloss)/mean(bst2$evaluation_log$train_logloss), 0.8) }) + +test_that("colsample_bytree works", { + # Randomly generate data matrix by sampling from uniform distribution [-1, 1] + set.seed(1) + train_x <- matrix(runif(1000, min = -1, max = 1), ncol = 100) + train_y <- as.numeric(rowSums(train_x) > 0) + test_x <- matrix(runif(1000, min = -1, max = 1), ncol = 100) + test_y <- as.numeric(rowSums(test_x) > 0) + colnames(train_x) <- paste0("Feature_", sprintf("%03d", 1:100)) + colnames(test_x) <- paste0("Feature_", sprintf("%03d", 1:100)) + dtrain <- xgb.DMatrix(train_x, label = train_y) + dtest <- xgb.DMatrix(test_x, label = test_y) + watchlist <- list(train = dtrain, eval = dtest) + # Use colsample_bytree = 0.01, so that roughly one out of 100 features is + # chosen for each tree + param <- list(max_depth = 2, eta = 0, silent = 1, nthread = 2, + colsample_bytree = 0.01, objective = "binary:logistic", + eval_metric = "auc") + set.seed(2) + bst <- xgb.train(param, dtrain, nrounds = 100, watchlist, verbose = 0) + xgb.importance(model = bst) + # If colsample_bytree works properly, a variety of features should be used + # in the 100 trees + expect_gte(nrow(xgb.importance(model = bst)), 30) +}) diff --git a/src/common/random.h b/src/common/random.h index 93041e9d0..237134721 100644 --- a/src/common/random.h +++ b/src/common/random.h @@ -33,14 +33,14 @@ using RandomEngine = std::mt19937; class CustomGlobalRandomEngine { public: /*! \brief The result type */ - typedef size_t result_type; + using result_type = uint32_t; /*! \brief The minimum of random numbers generated */ inline static constexpr result_type min() { return 0; } /*! \brief The maximum random numbers generated */ inline static constexpr result_type max() { - return std::numeric_limits::max(); + return std::numeric_limits::max(); } /*! * \brief seed function, to be implemented