**Symptom** Apple Clang's implementation of `std::shuffle` expects doesn't work
correctly when it is run with the random bit generator for R package:
```cpp
CustomGlobalRandomEngine::result_type
CustomGlobalRandomEngine::operator()() {
return static_cast<result_type>(
std::floor(unif_rand() * CustomGlobalRandomEngine::max()));
}
```
Minimial reproduction of failure (compile using Apple Clang 10.0):
```cpp
std::vector<int> feature_set(100);
std::iota(feature_set.begin(), feature_set.end(), 0);
// initialize with 0, 1, 2, 3, ..., 99
std::shuffle(feature_set.begin(), feature_set.end(), common::GlobalRandom());
// This returns 0, 1, 2, ..., 99, so content didn't get shuffled at all!!!
```
Note that this bug is platform-dependent; it does not appear when GCC or
upstream LLVM Clang is used.
**Diagnosis** Apple Clang's `std::shuffle` expects 32-bit integer
inputs, whereas `CustomGlobalRandomEngine::operator()` produces 64-bit
integers.
**Fix** Have `CustomGlobalRandomEngine::operator()` produce 32-bit integers.
Closes #3523.
This commit is contained in:
parent
4302fc4027
commit
b38c636d05
@ -237,3 +237,28 @@ test_that("max_delta_step works", {
|
|||||||
expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss))
|
expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss))
|
||||||
expect_lt(mean(bst1$evaluation_log$train_logloss)/mean(bst2$evaluation_log$train_logloss), 0.8)
|
expect_lt(mean(bst1$evaluation_log$train_logloss)/mean(bst2$evaluation_log$train_logloss), 0.8)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
test_that("colsample_bytree works", {
|
||||||
|
# Randomly generate data matrix by sampling from uniform distribution [-1, 1]
|
||||||
|
set.seed(1)
|
||||||
|
train_x <- matrix(runif(1000, min = -1, max = 1), ncol = 100)
|
||||||
|
train_y <- as.numeric(rowSums(train_x) > 0)
|
||||||
|
test_x <- matrix(runif(1000, min = -1, max = 1), ncol = 100)
|
||||||
|
test_y <- as.numeric(rowSums(test_x) > 0)
|
||||||
|
colnames(train_x) <- paste0("Feature_", sprintf("%03d", 1:100))
|
||||||
|
colnames(test_x) <- paste0("Feature_", sprintf("%03d", 1:100))
|
||||||
|
dtrain <- xgb.DMatrix(train_x, label = train_y)
|
||||||
|
dtest <- xgb.DMatrix(test_x, label = test_y)
|
||||||
|
watchlist <- list(train = dtrain, eval = dtest)
|
||||||
|
# Use colsample_bytree = 0.01, so that roughly one out of 100 features is
|
||||||
|
# chosen for each tree
|
||||||
|
param <- list(max_depth = 2, eta = 0, silent = 1, nthread = 2,
|
||||||
|
colsample_bytree = 0.01, objective = "binary:logistic",
|
||||||
|
eval_metric = "auc")
|
||||||
|
set.seed(2)
|
||||||
|
bst <- xgb.train(param, dtrain, nrounds = 100, watchlist, verbose = 0)
|
||||||
|
xgb.importance(model = bst)
|
||||||
|
# If colsample_bytree works properly, a variety of features should be used
|
||||||
|
# in the 100 trees
|
||||||
|
expect_gte(nrow(xgb.importance(model = bst)), 30)
|
||||||
|
})
|
||||||
|
|||||||
@ -33,14 +33,14 @@ using RandomEngine = std::mt19937;
|
|||||||
class CustomGlobalRandomEngine {
|
class CustomGlobalRandomEngine {
|
||||||
public:
|
public:
|
||||||
/*! \brief The result type */
|
/*! \brief The result type */
|
||||||
typedef size_t result_type;
|
using result_type = uint32_t;
|
||||||
/*! \brief The minimum of random numbers generated */
|
/*! \brief The minimum of random numbers generated */
|
||||||
inline static constexpr result_type min() {
|
inline static constexpr result_type min() {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
/*! \brief The maximum random numbers generated */
|
/*! \brief The maximum random numbers generated */
|
||||||
inline static constexpr result_type max() {
|
inline static constexpr result_type max() {
|
||||||
return std::numeric_limits<size_t>::max();
|
return std::numeric_limits<result_type>::max();
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief seed function, to be implemented
|
* \brief seed function, to be implemented
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user