Feature weights (#5962)

This commit is contained in:
Jiaming Yuan
2020-08-18 19:55:41 +08:00
committed by GitHub
parent a418278064
commit 4d99c58a5f
25 changed files with 509 additions and 104 deletions

View File

@@ -455,7 +455,8 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
label_lower_bound=None,
label_upper_bound=None,
feature_names=None,
feature_types=None):
feature_types=None,
feature_weights=None):
'''Set meta info for DMatrix.'''
if label is not None:
self.set_label(label)
@@ -473,6 +474,10 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
self.feature_names = feature_names
if feature_types is not None:
self.feature_types = feature_types
if feature_weights is not None:
from .data import dispatch_meta_backend
dispatch_meta_backend(matrix=self, data=feature_weights,
name='feature_weights')
def get_float_info(self, field):
"""Get float property from the DMatrix.

View File

@@ -530,22 +530,38 @@ def dispatch_data_backend(data, missing, threads,
raise TypeError('Not supported type for data.' + str(type(data)))
def _to_data_type(dtype: str, name: str):
    """Map a dtype name to its integer type code.

    Parameters
    ----------
    dtype : str
        Name of the dtype, one of ``'float32'``, ``'float64'``, ``'uint32'``
        or ``'uint64'``.
    name : str
        Name of the field the data belongs to; only used in the error message.

    Returns
    -------
    int
        1 for float32, 2 for float64, 3 for uint32, 4 for uint64.

    Raises
    ------
    TypeError
        If ``dtype`` is not one of the four supported names.
    """
    dtype_map = {'float32': 1, 'float64': 2, 'uint32': 3, 'uint64': 4}
    # Membership is tested on the dict itself; going through ``.keys()`` adds
    # nothing.
    if dtype not in dtype_map:
        raise TypeError(
            f'Expecting float32, float64, uint32, uint64, got {dtype} ' +
            f'for {name}.')
    return dtype_map[dtype]
def _validate_meta_shape(data):
    """Check that array-like meta info is a vector.

    Objects without a ``shape`` attribute are accepted without any check.
    Otherwise the shape must be 1-dimensional, or 2-dimensional with a second
    dimension of size 0 or 1.

    Raises
    ------
    AssertionError
        If ``data.shape`` describes anything other than a vector or a single
        (possibly empty) column.  The message includes the offending shape.
    """
    if not hasattr(data, 'shape'):
        return
    shape = data.shape
    is_vector = len(shape) == 1 or (len(shape) == 2 and shape[1] in (0, 1))
    # Still an assertion so callers see the same exception type as before;
    # the message is new and reports the shape that was rejected.
    assert is_vector, (
        'Meta info must be 1-dimensional or a single column, got shape: ' +
        str(shape))
def _meta_from_numpy(data, field, dtype, handle):
# Set the meta info named `field` on the DMatrix `handle` from a numpy array.
#
# NOTE(review): this listing is a diff rendered without +/- markers or
# indentation, so removed and added lines appear interleaved.  The
# XGDMatrixSetUIntInfo / XGDMatrixSetFloatInfo branches below look like the
# pre-change implementation, and the __array_interface__ path after them like
# its replacement -- confirm against the real file before relying on either.
data = _maybe_np_slice(data, dtype)
# Dispatch on the requested dtype string.
if dtype == 'uint32':
# NOTE(review): c_data is built here but the call below constructs a second
# array with ctypes.c_uint instead of passing c_data.
c_data = c_array(ctypes.c_uint32, data)
_check_call(_LIB.XGDMatrixSetUIntInfo(handle,
c_str(field),
c_array(ctypes.c_uint, data),
c_bst_ulong(len(data))))
elif dtype == 'float':
c_data = c_array(ctypes.c_float, data)
_check_call(_LIB.XGDMatrixSetFloatInfo(handle,
c_str(field),
c_data,
c_bst_ulong(len(data))))
else:
# Any dtype other than 'uint32' or 'float' is rejected here.
raise TypeError('Unsupported type ' + str(dtype) + ' for:' + field)
# Dense-info path: hand the array's buffer to the native library directly.
interface = data.__array_interface__
# Masked arrays are rejected.
assert interface.get('mask', None) is None, 'Masked array is not supported'
# Number of entries is taken from the first dimension only.
size = data.shape[0]
# Integer type code for the array's dtype; _to_data_type raises TypeError for
# anything other than float32, float64, uint32 or uint64.
c_type = _to_data_type(str(data.dtype), field)
# First element of the interface's 'data' entry, wrapped as a void pointer.
ptr = interface['data'][0]
ptr = ctypes.c_void_p(ptr)
_check_call(_LIB.XGDMatrixSetDenseInfo(
handle,
c_str(field),
ptr,
c_bst_ulong(size),
c_type
))
def _meta_from_list(data, field, dtype, handle):
@@ -595,6 +611,7 @@ def _meta_from_dt(data, field, dtype, handle):
def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
'''Dispatch for meta info.'''
handle = matrix.handle
_validate_meta_shape(data)
if data is None:
return
if _is_list(data):

View File

@@ -441,6 +441,7 @@ class XGBModel(XGBModelBase):
def fit(self, X, y, sample_weight=None, base_margin=None,
eval_set=None, eval_metric=None, early_stopping_rounds=None,
verbose=True, xgb_model=None, sample_weight_eval_set=None,
feature_weights=None,
callbacks=None):
# pylint: disable=invalid-name,attribute-defined-outside-init
"""Fit gradient boosting model
@@ -459,9 +460,6 @@ class XGBModel(XGBModelBase):
A list of (X, y) tuple pairs to use as validation sets, for which
metrics will be computed.
Validation metrics will help us track the performance of the model.
sample_weight_eval_set : list, optional
A list of the form [L_1, L_2, ..., L_n], where each L_i is a list of
instance weights on the i-th validation set.
eval_metric : str, list of str, or callable, optional
If a str, should be a built-in evaluation metric to use. See
doc/parameter.rst.
@@ -490,6 +488,13 @@ class XGBModel(XGBModelBase):
xgb_model : str
file name of stored XGBoost model or 'Booster' instance XGBoost model to be
loaded before training (allows training continuation).
sample_weight_eval_set : list, optional
A list of the form [L_1, L_2, ..., L_n], where each L_i is a list of
instance weights on the i-th validation set.
feature_weights: array_like
Weight for each feature, defines the probability of each feature
being selected when colsample is being used. All values must be
greater than 0, otherwise a `ValueError` is thrown.
callbacks : list of callback functions
List of callback functions that are applied at end of each iteration.
It is possible to use predefined callbacks by using :ref:`callback_api`.
@@ -498,6 +503,7 @@ class XGBModel(XGBModelBase):
.. code-block:: python
[xgb.callback.reset_learning_rate(custom_rates)]
"""
self.n_features_in_ = X.shape[1]
@@ -505,6 +511,7 @@ class XGBModel(XGBModelBase):
base_margin=base_margin,
missing=self.missing,
nthread=self.n_jobs)
train_dmatrix.set_info(feature_weights=feature_weights)
evals_result = {}
@@ -759,7 +766,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
def fit(self, X, y, sample_weight=None, base_margin=None,
eval_set=None, eval_metric=None,
early_stopping_rounds=None, verbose=True, xgb_model=None,
sample_weight_eval_set=None, callbacks=None):
sample_weight_eval_set=None, feature_weights=None, callbacks=None):
# pylint: disable = attribute-defined-outside-init,arguments-differ
evals_result = {}
@@ -821,6 +828,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
base_margin=base_margin,
missing=self.missing, nthread=self.n_jobs)
train_dmatrix.set_info(feature_weights=feature_weights)
self._Booster = train(xgb_options, train_dmatrix,
self.get_num_boosting_rounds(),
@@ -1101,10 +1109,10 @@ class XGBRanker(XGBModel):
raise ValueError("please use XGBRanker for ranking task")
def fit(self, X, y, group, sample_weight=None, base_margin=None,
eval_set=None,
sample_weight_eval_set=None, eval_group=None, eval_metric=None,
eval_set=None, sample_weight_eval_set=None,
eval_group=None, eval_metric=None,
early_stopping_rounds=None, verbose=False, xgb_model=None,
callbacks=None):
feature_weights=None, callbacks=None):
# pylint: disable = attribute-defined-outside-init,arguments-differ
"""Fit gradient boosting ranker
@@ -1170,6 +1178,10 @@ class XGBRanker(XGBModel):
xgb_model : str
file name of stored XGBoost model or 'Booster' instance XGBoost
model to be loaded before training (allows training continuation).
feature_weights: array_like
Weight for each feature, defines the probability of each feature
being selected when colsample is being used. All values must be
greater than 0, otherwise a `ValueError` is thrown.
callbacks : list of callback functions
List of callback functions that are applied at end of each
iteration. It is possible to use predefined callbacks by using
@@ -1205,6 +1217,7 @@ class XGBRanker(XGBModel):
train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight,
base_margin=base_margin,
missing=self.missing, nthread=self.n_jobs)
train_dmatrix.set_info(feature_weights=feature_weights)
train_dmatrix.set_group(group)
evals_result = {}