Feature weights (#5962)

2020-08-18 19:55:41 +08:00
parent a418278064
commit 4d99c58a5f
25 changed files with 509 additions and 104 deletions
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -455,7 +455,8 @@ class DMatrix:                  # pylint: disable=too-many-instance-attributes
                 label_lower_bound=None,
                 label_upper_bound=None,
                 feature_names=None,
-                 feature_types=None):
+                 feature_types=None,
+                 feature_weights=None):
        '''Set meta info for DMatrix.'''
        if label is not None:
            self.set_label(label)
@@ -473,6 +474,10 @@ class DMatrix:                  # pylint: disable=too-many-instance-attributes
            self.feature_names = feature_names
        if feature_types is not None:
            self.feature_types = feature_types
+        if feature_weights is not None:
+            from .data import dispatch_meta_backend
+            dispatch_meta_backend(matrix=self, data=feature_weights,
+                                  name='feature_weights')

    def get_float_info(self, field):
        """Get float property from the DMatrix.
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -530,22 +530,38 @@ def dispatch_data_backend(data, missing, threads,
    raise TypeError('Not supported type for data.' + str(type(data)))


+def _to_data_type(dtype: str, name: str):
+    dtype_map = {'float32': 1, 'float64': 2, 'uint32': 3, 'uint64': 4}
+    if dtype not in dtype_map.keys():
+        raise TypeError(
+            f'Expecting float32, float64, uint32, uint64, got {dtype} ' +
+            f'for {name}.')
+    return dtype_map[dtype]
+
+
+def _validate_meta_shape(data):
+    if hasattr(data, 'shape'):
+        assert len(data.shape) == 1 or (
+            len(data.shape) == 2 and
+            (data.shape[1] == 0 or data.shape[1] == 1))
+
+
 def _meta_from_numpy(data, field, dtype, handle):
    data = _maybe_np_slice(data, dtype)
-    if dtype == 'uint32':
-        c_data = c_array(ctypes.c_uint32, data)
-        _check_call(_LIB.XGDMatrixSetUIntInfo(handle,
-                                              c_str(field),
-                                              c_array(ctypes.c_uint, data),
-                                              c_bst_ulong(len(data))))
-    elif dtype == 'float':
-        c_data = c_array(ctypes.c_float, data)
-        _check_call(_LIB.XGDMatrixSetFloatInfo(handle,
-                                               c_str(field),
-                                               c_data,
-                                               c_bst_ulong(len(data))))
-    else:
-        raise TypeError('Unsupported type ' + str(dtype) + ' for:' + field)
+    interface = data.__array_interface__
+    assert interface.get('mask', None) is None, 'Masked array is not supported'
+    size = data.shape[0]
+
+    c_type = _to_data_type(str(data.dtype), field)
+    ptr = interface['data'][0]
+    ptr = ctypes.c_void_p(ptr)
+    _check_call(_LIB.XGDMatrixSetDenseInfo(
+        handle,
+        c_str(field),
+        ptr,
+        c_bst_ulong(size),
+        c_type
+    ))


 def _meta_from_list(data, field, dtype, handle):
@@ -595,6 +611,7 @@ def _meta_from_dt(data, field, dtype, handle):
 def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
    '''Dispatch for meta info.'''
    handle = matrix.handle
+    _validate_meta_shape(data)
    if data is None:
        return
    if _is_list(data):
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -441,6 +441,7 @@ class XGBModel(XGBModelBase):
    def fit(self, X, y, sample_weight=None, base_margin=None,
            eval_set=None, eval_metric=None, early_stopping_rounds=None,
            verbose=True, xgb_model=None, sample_weight_eval_set=None,
+            feature_weights=None,
            callbacks=None):
        # pylint: disable=invalid-name,attribute-defined-outside-init
        """Fit gradient boosting model
@@ -459,9 +460,6 @@ class XGBModel(XGBModelBase):
            A list of (X, y) tuple pairs to use as validation sets, for which
            metrics will be computed.
            Validation metrics will help us track the performance of the model.
-        sample_weight_eval_set : list, optional
-            A list of the form [L_1, L_2, ..., L_n], where each L_i is a list of
-            instance weights on the i-th validation set.
        eval_metric : str, list of str, or callable, optional
            If a str, should be a built-in evaluation metric to use. See
            doc/parameter.rst.
@@ -490,6 +488,13 @@ class XGBModel(XGBModelBase):
        xgb_model : str
            file name of stored XGBoost model or 'Booster' instance XGBoost model to be
            loaded before training (allows training continuation).
+        sample_weight_eval_set : list, optional
+            A list of the form [L_1, L_2, ..., L_n], where each L_i is a list of
+            instance weights on the i-th validation set.
+        feature_weights : array_like
+            Weight for each feature; defines the probability of each feature
+            being selected when colsample is being used.  All values must be
+            greater than 0, otherwise a `ValueError` is thrown.
        callbacks : list of callback functions
            List of callback functions that are applied at end of each iteration.
            It is possible to use predefined callbacks by using :ref:`callback_api`.
@@ -498,6 +503,7 @@ class XGBModel(XGBModelBase):
            .. code-block:: python

                [xgb.callback.reset_learning_rate(custom_rates)]
+
        """
        self.n_features_in_ = X.shape[1]

@@ -505,6 +511,7 @@ class XGBModel(XGBModelBase):
                                base_margin=base_margin,
                                missing=self.missing,
                                nthread=self.n_jobs)
+        train_dmatrix.set_info(feature_weights=feature_weights)

        evals_result = {}

@@ -759,7 +766,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
    def fit(self, X, y, sample_weight=None, base_margin=None,
            eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True, xgb_model=None,
-            sample_weight_eval_set=None, callbacks=None):
+            sample_weight_eval_set=None, feature_weights=None, callbacks=None):
        # pylint: disable = attribute-defined-outside-init,arguments-differ

        evals_result = {}
@@ -821,6 +828,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
        train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
                                base_margin=base_margin,
                                missing=self.missing, nthread=self.n_jobs)
+        train_dmatrix.set_info(feature_weights=feature_weights)

        self._Booster = train(xgb_options, train_dmatrix,
                              self.get_num_boosting_rounds(),
@@ -1101,10 +1109,10 @@ class XGBRanker(XGBModel):
            raise ValueError("please use XGBRanker for ranking task")

    def fit(self, X, y, group, sample_weight=None, base_margin=None,
-            eval_set=None,
-            sample_weight_eval_set=None, eval_group=None, eval_metric=None,
+            eval_set=None, sample_weight_eval_set=None,
+            eval_group=None, eval_metric=None,
            early_stopping_rounds=None, verbose=False, xgb_model=None,
-            callbacks=None):
+            feature_weights=None, callbacks=None):
        # pylint: disable = attribute-defined-outside-init,arguments-differ
        """Fit gradient boosting ranker

@@ -1170,6 +1178,10 @@ class XGBRanker(XGBModel):
        xgb_model : str
            file name of stored XGBoost model or 'Booster' instance XGBoost
            model to be loaded before training (allows training continuation).
+        feature_weights : array_like
+            Weight for each feature; defines the probability of each feature
+            being selected when colsample is being used.  All values must be
+            greater than 0, otherwise a `ValueError` is thrown.
        callbacks : list of callback functions
            List of callback functions that are applied at end of each
            iteration.  It is possible to use predefined callbacks by using
@@ -1205,6 +1217,7 @@ class XGBRanker(XGBModel):
        train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight,
                                base_margin=base_margin,
                                missing=self.missing, nthread=self.n_jobs)
+        train_dmatrix.set_info(feature_weights=feature_weights)
        train_dmatrix.set_group(group)

        evals_result = {}