Fix #3523: Fix CustomGlobalRandomEngine for R (#3781)

**Symptom** Apple Clang's implementation of `std::shuffle` expects doesn't work
correctly when it is run with the random bit generator for R package:
```cpp
CustomGlobalRandomEngine::result_type
CustomGlobalRandomEngine::operator()() {
  return static_cast<result_type>(
      std::floor(unif_rand() * CustomGlobalRandomEngine::max()));
}
```

Minimial reproduction of failure (compile using Apple Clang 10.0):
```cpp
std::vector<int> feature_set(100);
std::iota(feature_set.begin(), feature_set.end(), 0);
    // initialize with 0, 1, 2, 3, ..., 99
std::shuffle(feature_set.begin(), feature_set.end(), common::GlobalRandom());
    // This returns 0, 1, 2, ..., 99, so content didn't get shuffled at all!!!
```

Note that this bug is platform-dependent; it does not appear when GCC or
upstream LLVM Clang is used.

**Diagnosis** Apple Clang's `std::shuffle` expects 32-bit integer
inputs, whereas `CustomGlobalRandomEngine::operator()` produces 64-bit
integers.

**Fix** Have `CustomGlobalRandomEngine::operator()` produce 32-bit integers.

Closes #3523.
This commit is contained in:
Philip Hyunsu Cho 2018-10-15 09:39:13 -07:00 committed by GitHub
parent 4302fc4027
commit b38c636d05
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 27 additions and 2 deletions

View File

@ -237,3 +237,28 @@ test_that("max_delta_step works", {
expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss)) expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss))
expect_lt(mean(bst1$evaluation_log$train_logloss)/mean(bst2$evaluation_log$train_logloss), 0.8) expect_lt(mean(bst1$evaluation_log$train_logloss)/mean(bst2$evaluation_log$train_logloss), 0.8)
}) })
test_that("colsample_bytree works", {
# Randomly generate data matrix by sampling from uniform distribution [-1, 1]
set.seed(1)
train_x <- matrix(runif(1000, min = -1, max = 1), ncol = 100)
train_y <- as.numeric(rowSums(train_x) > 0)
test_x <- matrix(runif(1000, min = -1, max = 1), ncol = 100)
test_y <- as.numeric(rowSums(test_x) > 0)
colnames(train_x) <- paste0("Feature_", sprintf("%03d", 1:100))
colnames(test_x) <- paste0("Feature_", sprintf("%03d", 1:100))
dtrain <- xgb.DMatrix(train_x, label = train_y)
dtest <- xgb.DMatrix(test_x, label = test_y)
watchlist <- list(train = dtrain, eval = dtest)
# Use colsample_bytree = 0.01, so that roughly one out of 100 features is
# chosen for each tree
param <- list(max_depth = 2, eta = 0, silent = 1, nthread = 2,
colsample_bytree = 0.01, objective = "binary:logistic",
eval_metric = "auc")
set.seed(2)
bst <- xgb.train(param, dtrain, nrounds = 100, watchlist, verbose = 0)
xgb.importance(model = bst)
# If colsample_bytree works properly, a variety of features should be used
# in the 100 trees
expect_gte(nrow(xgb.importance(model = bst)), 30)
})

View File

@ -33,14 +33,14 @@ using RandomEngine = std::mt19937;
class CustomGlobalRandomEngine { class CustomGlobalRandomEngine {
public: public:
/*! \brief The result type */ /*! \brief The result type */
typedef size_t result_type; using result_type = uint32_t;
/*! \brief The minimum of random numbers generated */ /*! \brief The minimum of random numbers generated */
inline static constexpr result_type min() { inline static constexpr result_type min() {
return 0; return 0;
} }
/*! \brief The maximum random numbers generated */ /*! \brief The maximum random numbers generated */
inline static constexpr result_type max() { inline static constexpr result_type max() {
return std::numeric_limits<size_t>::max(); return std::numeric_limits<result_type>::max();
} }
/*! /*!
* \brief seed function, to be implemented * \brief seed function, to be implemented