Compare commits

..

9 Commits

Author SHA1 Message Date
Jiaming Yuan
000292ce6d Bump release version to 1.3.3. (#6624) 2021-01-20 19:23:31 +08:00
Jiaming Yuan
d3ec116322 Revert ntree limit fix (#6616) (#6622)
The old (pre-fix) best_ntree_limit ignores the num_class parameter, which is incorrect. Previously we worked around this in the C++ layer to avoid possible breaking changes in other language bindings, but the Python interpretation stayed incorrect. The PR fixed that in Python to consider num_class, but didn't remove the old workaround, so the tree calculation in the predictor is incorrect; see PredictBatch in CPUPredictor.
2021-01-20 04:20:07 +08:00
Jiaming Yuan
a018028471 Remove type check for solaris. (#6606) 2021-01-15 18:20:39 +08:00
fis
3e343159ef Release patch release 1.3.2 2021-01-13 17:35:00 +08:00
Jiaming Yuan
99e802f2ff Remove duplicated DMatrix. (#6592) (#6599) 2021-01-13 04:44:06 +08:00
Jiaming Yuan
6a29afb480 Fix evaluation result for XGBRanker. (#6594) (#6600)
* Remove duplicated code, which fixes typo `evals_result` -> `evals_result_`.
2021-01-13 04:42:43 +08:00
Jiaming Yuan
8e321adac8 Support Solaris. (#6578) (#6588)
* Add system header.

* Remove use of TR1 on Solaris

Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
2021-01-11 02:31:29 +08:00
Jiaming Yuan
d0ec65520a [backport] Fix best_ntree_limit for dart and gblinear. (#6579) (#6587)
* [backport] Fix `best_ntree_limit` for dart and gblinear. (#6579)

* Backport num group test fix.
2021-01-11 01:46:05 +08:00
Jiaming Yuan
7aec915dcd [Backport] Rename data to X in predict_proba. (#6555) (#6586)
* [Breaking] Rename `data` to `X` in `predict_proba`. (#6555)

Newer scikit-learn versions pass this parameter as a keyword argument, and `X` is the
predefined keyword.

* Use pip to install latest Python graphviz on Windows CI.

* Suppress health check.
2021-01-10 16:05:17 +08:00
23 changed files with 154 additions and 86 deletions

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.13) cmake_minimum_required(VERSION 3.13)
project(xgboost LANGUAGES CXX C VERSION 1.3.1) project(xgboost LANGUAGES CXX C VERSION 1.3.3)
include(cmake/Utils.cmake) include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules") list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0022 NEW)

View File

@@ -1,7 +1,7 @@
Package: xgboost Package: xgboost
Type: Package Type: Package
Title: Extreme Gradient Boosting Title: Extreme Gradient Boosting
Version: 1.3.1.1 Version: 1.3.3.1
Date: 2020-08-28 Date: 2020-08-28
Authors@R: c( Authors@R: c(
person("Tianqi", "Chen", role = c("aut"), person("Tianqi", "Chen", role = c("aut"),

View File

@@ -55,7 +55,7 @@
#endif // defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4) #endif // defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4)
#if defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4) && \ #if defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4) && \
!defined(__CUDACC__) !defined(__CUDACC__) && !defined(__sun) && !defined(sun)
#include <parallel/algorithm> #include <parallel/algorithm>
#define XGBOOST_PARALLEL_SORT(X, Y, Z) __gnu_parallel::sort((X), (Y), (Z)) #define XGBOOST_PARALLEL_SORT(X, Y, Z) __gnu_parallel::sort((X), (Y), (Z))
#define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) \ #define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) \

View File

@@ -6,6 +6,6 @@
#define XGBOOST_VER_MAJOR 1 #define XGBOOST_VER_MAJOR 1
#define XGBOOST_VER_MINOR 3 #define XGBOOST_VER_MINOR 3
#define XGBOOST_VER_PATCH 1 #define XGBOOST_VER_PATCH 3
#endif // XGBOOST_VERSION_CONFIG_H_ #endif // XGBOOST_VERSION_CONFIG_H_

View File

@@ -34,9 +34,9 @@ TO_VERSION=$2
sed_i() { sed_i() {
perl -p -000 -e "$1" "$2" > "$2.tmp" && mv "$2.tmp" "$2" perl -p -000 -e "$1" "$2" > "$2.tmp" && mv "$2.tmp" "$2"
} }
export -f sed_i export -f sed_i
BASEDIR=$(dirname $0)/.. BASEDIR=$(dirname $0)/..
find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \ find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \
-exec bash -c \ -exec bash -c \

View File

@@ -6,7 +6,7 @@
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
<packaging>pom</packaging> <packaging>pom</packaging>
<name>XGBoost JVM Package</name> <name>XGBoost JVM Package</name>
<description>JVM Package for XGBoost</description> <description>JVM Package for XGBoost</description>

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
</parent> </parent>
<artifactId>xgboost4j-example_2.12</artifactId> <artifactId>xgboost4j-example_2.12</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<build> <build>
<plugins> <plugins>
@@ -26,7 +26,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId> <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId> <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
</parent> </parent>
<artifactId>xgboost4j-flink_2.12</artifactId> <artifactId>xgboost4j-flink_2.12</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
<build> <build>
<plugins> <plugins>
<plugin> <plugin>
@@ -26,7 +26,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId> <artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
</parent> </parent>
<artifactId>xgboost4j-gpu_2.12</artifactId> <artifactId>xgboost4j-gpu_2.12</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<dependencies> <dependencies>

View File

@@ -6,7 +6,7 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
</parent> </parent>
<artifactId>xgboost4j-spark-gpu_2.12</artifactId> <artifactId>xgboost4j-spark-gpu_2.12</artifactId>
<build> <build>
@@ -24,7 +24,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId> <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>

View File

@@ -6,7 +6,7 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
</parent> </parent>
<artifactId>xgboost4j-spark_2.12</artifactId> <artifactId>xgboost4j-spark_2.12</artifactId>
<build> <build>
@@ -24,7 +24,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId> <artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
</parent> </parent>
<artifactId>xgboost4j_2.12</artifactId> <artifactId>xgboost4j_2.12</artifactId>
<version>1.3.1</version> <version>1.3.3</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<dependencies> <dependencies>

View File

@@ -1 +1 @@
1.3.1 1.3.3

View File

@@ -1210,10 +1210,10 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
early_stopping_rounds=early_stopping_rounds, early_stopping_rounds=early_stopping_rounds,
verbose=verbose) verbose=verbose)
async def _predict_proba_async(self, data, output_margin=False, async def _predict_proba_async(self, X, output_margin=False,
base_margin=None): base_margin=None):
test_dmatrix = await DaskDMatrix( test_dmatrix = await DaskDMatrix(
client=self.client, data=data, base_margin=base_margin, client=self.client, data=X, base_margin=base_margin,
missing=self.missing missing=self.missing
) )
pred_probs = await predict(client=self.client, pred_probs = await predict(client=self.client,
@@ -1223,11 +1223,11 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
return pred_probs return pred_probs
# pylint: disable=arguments-differ,missing-docstring # pylint: disable=arguments-differ,missing-docstring
def predict_proba(self, data, output_margin=False, base_margin=None): def predict_proba(self, X, output_margin=False, base_margin=None):
_assert_dask_support() _assert_dask_support()
return self.client.sync( return self.client.sync(
self._predict_proba_async, self._predict_proba_async,
data, X=X,
output_margin=output_margin, output_margin=output_margin,
base_margin=base_margin base_margin=base_margin
) )

View File

@@ -4,6 +4,7 @@
import copy import copy
import warnings import warnings
import json import json
from typing import Optional
import numpy as np import numpy as np
from .core import Booster, DMatrix, XGBoostError, _deprecate_positional_args from .core import Booster, DMatrix, XGBoostError, _deprecate_positional_args
from .training import train from .training import train
@@ -494,6 +495,13 @@ class XGBModel(XGBModelBase):
# Delete the attribute after load # Delete the attribute after load
self.get_booster().set_attr(scikit_learn=None) self.get_booster().set_attr(scikit_learn=None)
def _set_evaluation_result(self, evals_result: Optional[dict]) -> None:
if evals_result:
for val in evals_result.items():
evals_result_key = list(val[1].keys())[0]
evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
self.evals_result_ = evals_result
@_deprecate_positional_args @_deprecate_positional_args
def fit(self, X, y, *, sample_weight=None, base_margin=None, def fit(self, X, y, *, sample_weight=None, base_margin=None,
eval_set=None, eval_metric=None, early_stopping_rounds=None, eval_set=None, eval_metric=None, early_stopping_rounds=None,
@@ -565,13 +573,6 @@ class XGBModel(XGBModelBase):
""" """
self.n_features_in_ = X.shape[1] self.n_features_in_ = X.shape[1]
train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight,
base_margin=base_margin,
missing=self.missing,
nthread=self.n_jobs)
train_dmatrix.set_info(feature_weights=feature_weights)
evals_result = {} evals_result = {}
train_dmatrix, evals = self._wrap_evaluation_matrices( train_dmatrix, evals = self._wrap_evaluation_matrices(
@@ -601,12 +602,7 @@ class XGBModel(XGBModelBase):
verbose_eval=verbose, xgb_model=xgb_model, verbose_eval=verbose, xgb_model=xgb_model,
callbacks=callbacks) callbacks=callbacks)
if evals_result: self._set_evaluation_result(evals_result)
for val in evals_result.items():
evals_result_key = list(val[1].keys())[0]
evals_result[val[0]][evals_result_key] = val[1][
evals_result_key]
self.evals_result_ = evals_result
if early_stopping_rounds is not None: if early_stopping_rounds is not None:
self.best_score = self._Booster.best_score self.best_score = self._Booster.best_score
@@ -919,12 +915,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
callbacks=callbacks) callbacks=callbacks)
self.objective = xgb_options["objective"] self.objective = xgb_options["objective"]
if evals_result: self._set_evaluation_result(evals_result)
for val in evals_result.items():
evals_result_key = list(val[1].keys())[0]
evals_result[val[0]][
evals_result_key] = val[1][evals_result_key]
self.evals_result_ = evals_result
if early_stopping_rounds is not None: if early_stopping_rounds is not None:
self.best_score = self._Booster.best_score self.best_score = self._Booster.best_score
@@ -995,10 +986,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
return self._le.inverse_transform(column_indexes) return self._le.inverse_transform(column_indexes)
return column_indexes return column_indexes
def predict_proba(self, data, ntree_limit=None, validate_features=False, def predict_proba(self, X, ntree_limit=None, validate_features=False,
base_margin=None): base_margin=None):
""" """ Predict the probability of each `X` example being of a given class.
Predict the probability of each `data` example being of a given class.
.. note:: This function is not thread safe .. note:: This function is not thread safe
@@ -1008,21 +998,22 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
Parameters Parameters
---------- ----------
data : array_like X : array_like
Feature matrix. Feature matrix.
ntree_limit : int ntree_limit : int
Limit number of trees in the prediction; defaults to best_ntree_limit if defined Limit number of trees in the prediction; defaults to best_ntree_limit if
(i.e. it has been trained with early stopping), otherwise 0 (use all trees). defined (i.e. it has been trained with early stopping), otherwise 0 (use all
trees).
validate_features : bool validate_features : bool
When this is True, validate that the Booster's and data's feature_names are identical. When this is True, validate that the Booster's and data's feature_names are
Otherwise, it is assumed that the feature_names are the same. identical. Otherwise, it is assumed that the feature_names are the same.
Returns Returns
------- -------
prediction : numpy array prediction : numpy array
a numpy array with the probability of each data example being of a given class. a numpy array with the probability of each data example being of a given class.
""" """
test_dmatrix = DMatrix(data, base_margin=base_margin, test_dmatrix = DMatrix(X, base_margin=base_margin,
missing=self.missing, nthread=self.n_jobs) missing=self.missing, nthread=self.n_jobs)
if ntree_limit is None: if ntree_limit is None:
ntree_limit = getattr(self, "best_ntree_limit", 0) ntree_limit = getattr(self, "best_ntree_limit", 0)
@@ -1328,12 +1319,7 @@ class XGBRanker(XGBModel):
self.objective = params["objective"] self.objective = params["objective"]
if evals_result: self._set_evaluation_result(evals_result)
for val in evals_result.items():
evals_result_key = list(val[1].keys())[0]
evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
self.evals_result = evals_result
if early_stopping_rounds is not None: if early_stopping_rounds is not None:
self.best_score = self._Booster.best_score self.best_score = self._Booster.best_score
self.best_iteration = self._Booster.best_iteration self.best_iteration = self._Booster.best_iteration

View File

@@ -4,6 +4,7 @@
"""Training Library containing training routines.""" """Training Library containing training routines."""
import warnings import warnings
import copy import copy
import json
import numpy as np import numpy as np
from .core import Booster, XGBoostError from .core import Booster, XGBoostError
@@ -123,7 +124,26 @@ def _train_internal(params, dtrain,
bst.best_iteration = int(bst.attr('best_iteration')) bst.best_iteration = int(bst.attr('best_iteration'))
else: else:
bst.best_iteration = nboost - 1 bst.best_iteration = nboost - 1
config = json.loads(bst.save_config())
booster = config['learner']['gradient_booster']['name']
if booster == 'gblinear':
num_parallel_tree = 0
elif booster == 'dart':
num_parallel_tree = int(
config['learner']['gradient_booster']['gbtree']['gbtree_train_param'][
'num_parallel_tree'
]
)
elif booster == 'gbtree':
num_parallel_tree = int(
config['learner']['gradient_booster']['gbtree_train_param'][
'num_parallel_tree']
)
else:
raise ValueError(f'Unknown booster: {booster}')
bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree
# Copy to serialise and unserialise booster to reset state and free # Copy to serialise and unserialise booster to reset state and free
# training memory # training memory
return bst.copy() return bst.copy()
@@ -162,9 +182,10 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
If there's more than one metric in the **eval_metric** parameter given in If there's more than one metric in the **eval_metric** parameter given in
**params**, the last metric will be used for early stopping. **params**, the last metric will be used for early stopping.
If early stopping occurs, the model will have three additional fields: If early stopping occurs, the model will have three additional fields:
``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``. ``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``. Use
(Use ``bst.best_ntree_limit`` to get the correct value if ``bst.best_ntree_limit`` to get the correct value if ``num_parallel_tree`` and/or
``num_parallel_tree`` and/or ``num_class`` appears in the parameters) ``num_class`` appears in the parameters. ``best_ntree_limit`` is the result of
``num_parallel_tree * (best_iteration + 1)``.
evals_result: dict evals_result: dict
This dictionary stores the evaluation results of all the items in watchlist. This dictionary stores the evaluation results of all the items in watchlist.

View File

@@ -25,6 +25,10 @@
#include <sys/socket.h> #include <sys/socket.h>
#include <sys/ioctl.h> #include <sys/ioctl.h>
#if defined(__sun) || defined(sun)
#include <sys/sockio.h>
#endif // defined(__sun) || defined(sun)
#endif // defined(_WIN32) #endif // defined(_WIN32)
#include <string> #include <string>

View File

@@ -10,10 +10,6 @@ namespace xgboost {
namespace gbm { namespace gbm {
void GBLinearModel::SaveModel(Json* p_out) const { void GBLinearModel::SaveModel(Json* p_out) const {
using WeightType = std::remove_reference<decltype(std::declval<decltype(weight)>().back())>::type;
using JsonFloat = Number::Float;
static_assert(std::is_same<WeightType, JsonFloat>::value,
"Weight type should be of the same type with JSON float");
auto& out = *p_out; auto& out = *p_out;
size_t const n_weights = weight.size(); size_t const n_weights = weight.size();

View File

@@ -9,7 +9,6 @@ dependencies:
- scikit-learn - scikit-learn
- pandas - pandas
- pytest - pytest
- python-graphviz
- boto3 - boto3
- hypothesis - hypothesis
- jsonschema - jsonschema
@@ -17,3 +16,4 @@ dependencies:
- pip: - pip:
- cupy-cuda101 - cupy-cuda101
- modin[all] - modin[all]
- graphviz

View File

@@ -5,8 +5,10 @@ import numpy as np
import asyncio import asyncio
import xgboost import xgboost
import subprocess import subprocess
import hypothesis
from hypothesis import given, strategies, settings, note from hypothesis import given, strategies, settings, note
from hypothesis._settings import duration from hypothesis._settings import duration
from hypothesis import HealthCheck
from test_gpu_updaters import parameter_strategy from test_gpu_updaters import parameter_strategy
if sys.platform.startswith("win"): if sys.platform.startswith("win"):
@@ -19,6 +21,11 @@ from test_with_dask import _get_client_workers # noqa
from test_with_dask import generate_array # noqa from test_with_dask import generate_array # noqa
import testing as tm # noqa import testing as tm # noqa
if hasattr(HealthCheck, 'function_scoped_fixture'):
suppress = [HealthCheck.function_scoped_fixture]
else:
suppress = hypothesis.utils.conventions.not_set
try: try:
import dask.dataframe as dd import dask.dataframe as dd
@@ -161,19 +168,24 @@ class TestDistributedGPU:
run_with_dask_dataframe(dxgb.DaskDMatrix, client) run_with_dask_dataframe(dxgb.DaskDMatrix, client)
run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client) run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client)
@given(params=parameter_strategy, num_rounds=strategies.integers(1, 20), @given(
dataset=tm.dataset_strategy) params=parameter_strategy,
@settings(deadline=duration(seconds=120)) num_rounds=strategies.integers(1, 20),
dataset=tm.dataset_strategy,
)
@settings(deadline=duration(seconds=120), suppress_health_check=suppress)
@pytest.mark.skipif(**tm.no_dask()) @pytest.mark.skipif(**tm.no_dask())
@pytest.mark.skipif(**tm.no_dask_cuda()) @pytest.mark.skipif(**tm.no_dask_cuda())
@pytest.mark.parametrize('local_cuda_cluster', [{'n_workers': 2}], indirect=['local_cuda_cluster']) @pytest.mark.parametrize(
"local_cuda_cluster", [{"n_workers": 2}], indirect=["local_cuda_cluster"]
)
@pytest.mark.mgpu @pytest.mark.mgpu
def test_gpu_hist(self, params, num_rounds, dataset, local_cuda_cluster): def test_gpu_hist(self, params, num_rounds, dataset, local_cuda_cluster):
with Client(local_cuda_cluster) as client: with Client(local_cuda_cluster) as client:
run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, client)
client) run_gpu_hist(
run_gpu_hist(params, num_rounds, dataset, params, num_rounds, dataset, dxgb.DaskDeviceQuantileDMatrix, client
dxgb.DaskDeviceQuantileDMatrix, client) )
@pytest.mark.skipif(**tm.no_cupy()) @pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.skipif(**tm.no_dask()) @pytest.mark.skipif(**tm.no_dask())

View File

@@ -33,9 +33,15 @@ def run_predict_leaf(predictor):
y = rng.randint(low=0, high=classes, size=rows) y = rng.randint(low=0, high=classes, size=rows)
m = xgb.DMatrix(X, y) m = xgb.DMatrix(X, y)
booster = xgb.train( booster = xgb.train(
{'num_parallel_tree': num_parallel_tree, 'num_class': classes, {
'predictor': predictor, 'tree_method': 'hist'}, m, "num_parallel_tree": num_parallel_tree,
num_boost_round=num_boost_round) "num_class": classes,
"predictor": predictor,
"tree_method": "hist",
},
m,
num_boost_round=num_boost_round,
)
empty = xgb.DMatrix(np.ones(shape=(0, cols))) empty = xgb.DMatrix(np.ones(shape=(0, cols)))
empty_leaf = booster.predict(empty, pred_leaf=True) empty_leaf = booster.predict(empty, pred_leaf=True)
@@ -52,12 +58,19 @@ def run_predict_leaf(predictor):
end = classes * num_parallel_tree * (j + 1) end = classes * num_parallel_tree * (j + 1)
layer = row[start: end] layer = row[start: end]
for c in range(classes): for c in range(classes):
tree_group = layer[c * num_parallel_tree: tree_group = layer[c * num_parallel_tree: (c + 1) * num_parallel_tree]
(c+1) * num_parallel_tree]
assert tree_group.shape[0] == num_parallel_tree assert tree_group.shape[0] == num_parallel_tree
# no subsampling so tree in same forest should output same # no subsampling so tree in same forest should output same
# leaf. # leaf.
assert np.all(tree_group == tree_group[0]) assert np.all(tree_group == tree_group[0])
ntree_limit = 2
sliced = booster.predict(
m, pred_leaf=True, ntree_limit=num_parallel_tree * ntree_limit
)
first = sliced[0, ...]
assert first.shape[0] == classes * num_parallel_tree * ntree_limit
return leaf return leaf

View File

@@ -8,7 +8,8 @@ import asyncio
from sklearn.datasets import make_classification from sklearn.datasets import make_classification
import os import os
import subprocess import subprocess
from hypothesis import given, settings, note import hypothesis
from hypothesis import given, settings, note, HealthCheck
from test_updaters import hist_parameter_strategy, exact_parameter_strategy from test_updaters import hist_parameter_strategy, exact_parameter_strategy
if sys.platform.startswith("win"): if sys.platform.startswith("win"):
@@ -17,6 +18,12 @@ if tm.no_dask()['condition']:
pytest.skip(msg=tm.no_dask()['reason'], allow_module_level=True) pytest.skip(msg=tm.no_dask()['reason'], allow_module_level=True)
if hasattr(HealthCheck, 'function_scoped_fixture'):
suppress = [HealthCheck.function_scoped_fixture]
else:
suppress = hypothesis.utils.conventions.not_set
try: try:
from distributed import LocalCluster, Client, get_client from distributed import LocalCluster, Client, get_client
from distributed.utils_test import client, loop, cluster_fixture from distributed.utils_test import client, loop, cluster_fixture
@@ -668,14 +675,14 @@ class TestWithDask:
@given(params=hist_parameter_strategy, @given(params=hist_parameter_strategy,
dataset=tm.dataset_strategy) dataset=tm.dataset_strategy)
@settings(deadline=None) @settings(deadline=None, suppress_health_check=suppress)
def test_hist(self, params, dataset, client): def test_hist(self, params, dataset, client):
num_rounds = 30 num_rounds = 30
self.run_updater_test(client, params, num_rounds, dataset, 'hist') self.run_updater_test(client, params, num_rounds, dataset, 'hist')
@given(params=exact_parameter_strategy, @given(params=exact_parameter_strategy,
dataset=tm.dataset_strategy) dataset=tm.dataset_strategy)
@settings(deadline=None) @settings(deadline=None, suppress_health_check=suppress)
def test_approx(self, client, params, dataset): def test_approx(self, client, params, dataset):
num_rounds = 30 num_rounds = 30
self.run_updater_test(client, params, num_rounds, dataset, 'approx') self.run_updater_test(client, params, num_rounds, dataset, 'approx')
@@ -795,7 +802,6 @@ class TestDaskCallbacks:
merged = xgb.dask._get_workers_from_data(train, evals=[(valid, 'Valid')]) merged = xgb.dask._get_workers_from_data(train, evals=[(valid, 'Valid')])
assert len(merged) == 2 assert len(merged) == 2
def test_data_initialization(self): def test_data_initialization(self):
'''Assert each worker has the correct amount of data, and DMatrix initialization doesn't '''Assert each worker has the correct amount of data, and DMatrix initialization doesn't
generate unnecessary copies of data. generate unnecessary copies of data.

View File

@@ -78,6 +78,34 @@ def test_multiclass_classification():
check_pred(preds4, labels, output_margin=False) check_pred(preds4, labels, output_margin=False)
def test_best_ntree_limit():
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
def train(booster, forest):
rounds = 4
cls = xgb.XGBClassifier(
n_estimators=rounds, num_parallel_tree=forest, booster=booster
).fit(
X, y, eval_set=[(X, y)], early_stopping_rounds=3
)
if forest:
assert cls.best_ntree_limit == rounds * forest
else:
assert cls.best_ntree_limit == 0
# best_ntree_limit is used by default, assert that under gblinear it's
# automatically ignored due to being 0.
cls.predict(X)
num_parallel_tree = 4
train('gbtree', num_parallel_tree)
train('dart', num_parallel_tree)
train('gblinear', None)
def test_ranking(): def test_ranking():
# generate random data # generate random data
x_train = np.random.rand(1000, 10) x_train = np.random.rand(1000, 10)
@@ -94,6 +122,8 @@ def test_ranking():
model = xgb.sklearn.XGBRanker(**params) model = xgb.sklearn.XGBRanker(**params)
model.fit(x_train, y_train, group=train_group, model.fit(x_train, y_train, group=train_group,
eval_set=[(x_valid, y_valid)], eval_group=[valid_group]) eval_set=[(x_valid, y_valid)], eval_group=[valid_group])
assert model.evals_result()
pred = model.predict(x_test) pred = model.predict(x_test)
train_data = xgb.DMatrix(x_train, y_train) train_data = xgb.DMatrix(x_train, y_train)