Merge pull request #503 from sinhrks/feature_types
Python: Add feature_types to DMatrix
This commit is contained in:
commit
db490d1c75
@ -146,10 +146,12 @@ class DMatrix(object):
|
|||||||
You can construct DMatrix from numpy.arrays
|
You can construct DMatrix from numpy.arrays
|
||||||
"""
|
"""
|
||||||
|
|
||||||
feature_names = None # for previous version's pickle
|
_feature_names = None # for previous version's pickle
|
||||||
|
_feature_types = None
|
||||||
|
|
||||||
def __init__(self, data, label=None, missing=0.0,
|
def __init__(self, data, label=None, missing=0.0,
|
||||||
weight=None, silent=False, feature_names=None):
|
weight=None, silent=False,
|
||||||
|
feature_names=None, feature_types=None):
|
||||||
"""
|
"""
|
||||||
Data matrix used in XGBoost.
|
Data matrix used in XGBoost.
|
||||||
|
|
||||||
@ -169,6 +171,8 @@ class DMatrix(object):
|
|||||||
Whether print messages during construction
|
Whether print messages during construction
|
||||||
feature_names : list, optional
|
feature_names : list, optional
|
||||||
Labels for features.
|
Labels for features.
|
||||||
|
feature_types : list, optional
|
||||||
|
Types for features; each entry is one of 'q', 'i', 'int' or 'float'.
|
||||||
"""
|
"""
|
||||||
# force into void_p, mac need to pass things in as void_p
|
# force into void_p, mac need to pass things in as void_p
|
||||||
if data is None:
|
if data is None:
|
||||||
@ -196,20 +200,8 @@ class DMatrix(object):
|
|||||||
if weight is not None:
|
if weight is not None:
|
||||||
self.set_weight(weight)
|
self.set_weight(weight)
|
||||||
|
|
||||||
# validate feature name
|
|
||||||
if not feature_names is None:
|
|
||||||
if not isinstance(feature_names, list):
|
|
||||||
feature_names = list(feature_names)
|
|
||||||
if len(feature_names) != len(set(feature_names)):
|
|
||||||
raise ValueError('feature_names must be unique')
|
|
||||||
if len(feature_names) != self.num_col():
|
|
||||||
msg = 'feature_names must have the same length as data'
|
|
||||||
raise ValueError(msg)
|
|
||||||
# prohibit to use symbols may affect to parse. e.g. ``[]=.``
|
|
||||||
if not all(isinstance(f, STRING_TYPES) and f.isalnum()
|
|
||||||
for f in feature_names):
|
|
||||||
raise ValueError('all feature_names must be alphanumerics')
|
|
||||||
self.feature_names = feature_names
|
self.feature_names = feature_names
|
||||||
|
self.feature_types = feature_types
|
||||||
|
|
||||||
def _init_from_csr(self, csr):
|
def _init_from_csr(self, csr):
|
||||||
"""
|
"""
|
||||||
@ -461,6 +453,88 @@ class DMatrix(object):
|
|||||||
ctypes.byref(res.handle)))
|
ctypes.byref(res.handle)))
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_names(self):
|
||||||
|
"""Get feature names (column labels).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
feature_names : list or None
|
||||||
|
"""
|
||||||
|
return self._feature_names
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_types(self):
|
||||||
|
"""Get feature types (column types).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
feature_types : list or None
|
||||||
|
"""
|
||||||
|
return self._feature_types
|
||||||
|
|
||||||
|
@feature_names.setter
|
||||||
|
def feature_names(self, feature_names):
|
||||||
|
"""Set feature names (column labels).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
feature_names : list or None
|
||||||
|
Labels for features. None will reset existing feature names and feature types
|
||||||
|
"""
|
||||||
|
if not feature_names is None:
|
||||||
|
# validate feature name
|
||||||
|
if not isinstance(feature_names, list):
|
||||||
|
feature_names = list(feature_names)
|
||||||
|
if len(feature_names) != len(set(feature_names)):
|
||||||
|
raise ValueError('feature_names must be unique')
|
||||||
|
if len(feature_names) != self.num_col():
|
||||||
|
msg = 'feature_names must have the same length as data'
|
||||||
|
raise ValueError(msg)
|
||||||
|
# prohibit symbols that may interfere with parsing, e.g. ``[]=.``
|
||||||
|
if not all(isinstance(f, STRING_TYPES) and f.isalnum()
|
||||||
|
for f in feature_names):
|
||||||
|
raise ValueError('all feature_names must be alphanumerics')
|
||||||
|
else:
|
||||||
|
# reset feature_types also
|
||||||
|
self.feature_types = None
|
||||||
|
self._feature_names = feature_names
|
||||||
|
|
||||||
|
@feature_types.setter
|
||||||
|
def feature_types(self, feature_types):
|
||||||
|
"""Set feature types (column types).
|
||||||
|
|
||||||
|
This is for displaying the results and unrelated
|
||||||
|
to the learning process.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
feature_types : list or None
|
||||||
|
Types for features ('q', 'i', 'int' or 'float'); a single string is applied to all columns. None will reset existing feature types
|
||||||
|
"""
|
||||||
|
if not feature_types is None:
|
||||||
|
|
||||||
|
if self.feature_names is None:
|
||||||
|
msg = 'Unable to set feature types before setting names'
|
||||||
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
if isinstance(feature_types, STRING_TYPES):
|
||||||
|
# single string will be applied to all columns
|
||||||
|
feature_types = [feature_types] * self.num_col()
|
||||||
|
|
||||||
|
if not isinstance(feature_types, list):
|
||||||
|
feature_types = list(feature_types)
|
||||||
|
if len(feature_types) != self.num_col():
|
||||||
|
msg = 'feature_types must have the same length as data'
|
||||||
|
raise ValueError(msg)
|
||||||
|
# only the type codes listed in ``valid`` below are accepted
|
||||||
|
|
||||||
|
valid = ('q', 'i', 'int', 'float')
|
||||||
|
if not all(isinstance(f, STRING_TYPES) and f in valid
|
||||||
|
for f in feature_types):
|
||||||
|
raise ValueError('all feature_types must be {i, q, int, float}')
|
||||||
|
self._feature_types = feature_types
|
||||||
|
|
||||||
|
|
||||||
class Booster(object):
|
class Booster(object):
|
||||||
"""A Booster of XGBoost.
|
"""A Booster of XGBoost.
|
||||||
@ -487,7 +561,8 @@ class Booster(object):
|
|||||||
for d in cache:
|
for d in cache:
|
||||||
if not isinstance(d, DMatrix):
|
if not isinstance(d, DMatrix):
|
||||||
raise TypeError('invalid cache item: {}'.format(type(d).__name__))
|
raise TypeError('invalid cache item: {}'.format(type(d).__name__))
|
||||||
self._validate_feature_names(d)
|
self._validate_features(d)
|
||||||
|
|
||||||
dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
|
dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
|
||||||
self.handle = ctypes.c_void_p()
|
self.handle = ctypes.c_void_p()
|
||||||
_check_call(_LIB.XGBoosterCreate(dmats, len(cache), ctypes.byref(self.handle)))
|
_check_call(_LIB.XGBoosterCreate(dmats, len(cache), ctypes.byref(self.handle)))
|
||||||
@ -572,7 +647,7 @@ class Booster(object):
|
|||||||
"""
|
"""
|
||||||
if not isinstance(dtrain, DMatrix):
|
if not isinstance(dtrain, DMatrix):
|
||||||
raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
|
raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
|
||||||
self._validate_feature_names(dtrain)
|
self._validate_features(dtrain)
|
||||||
|
|
||||||
if fobj is None:
|
if fobj is None:
|
||||||
_check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
|
_check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
|
||||||
@ -598,7 +673,7 @@ class Booster(object):
|
|||||||
raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))
|
raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))
|
||||||
if not isinstance(dtrain, DMatrix):
|
if not isinstance(dtrain, DMatrix):
|
||||||
raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
|
raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
|
||||||
self._validate_feature_names(dtrain)
|
self._validate_features(dtrain)
|
||||||
|
|
||||||
_check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle,
|
_check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle,
|
||||||
c_array(ctypes.c_float, grad),
|
c_array(ctypes.c_float, grad),
|
||||||
@ -629,7 +704,7 @@ class Booster(object):
|
|||||||
raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__))
|
raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__))
|
||||||
if not isinstance(d[1], STRING_TYPES):
|
if not isinstance(d[1], STRING_TYPES):
|
||||||
raise TypeError('expected string, got {}'.format(type(d[1]).__name__))
|
raise TypeError('expected string, got {}'.format(type(d[1]).__name__))
|
||||||
self._validate_feature_names(d[0])
|
self._validate_features(d[0])
|
||||||
|
|
||||||
dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals])
|
dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals])
|
||||||
evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals])
|
evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals])
|
||||||
@ -664,7 +739,7 @@ class Booster(object):
|
|||||||
result: str
|
result: str
|
||||||
Evaluation result string.
|
Evaluation result string.
|
||||||
"""
|
"""
|
||||||
self._validate_feature_names(data)
|
self._validate_features(data)
|
||||||
return self.eval_set([(data, name)], iteration)
|
return self.eval_set([(data, name)], iteration)
|
||||||
|
|
||||||
def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False):
|
def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False):
|
||||||
@ -703,7 +778,7 @@ class Booster(object):
|
|||||||
if pred_leaf:
|
if pred_leaf:
|
||||||
option_mask |= 0x02
|
option_mask |= 0x02
|
||||||
|
|
||||||
self._validate_feature_names(data)
|
self._validate_features(data)
|
||||||
|
|
||||||
length = ctypes.c_ulong()
|
length = ctypes.c_ulong()
|
||||||
preds = ctypes.POINTER(ctypes.c_float)()
|
preds = ctypes.POINTER(ctypes.c_float)()
|
||||||
@ -805,9 +880,12 @@ class Booster(object):
|
|||||||
|
|
||||||
fname = from_pystr_to_cstr(self.feature_names)
|
fname = from_pystr_to_cstr(self.feature_names)
|
||||||
|
|
||||||
# supports quantitative type only
|
if self.feature_types is None:
|
||||||
# {'q': quantitative, 'i': indicator}
|
# use quantitative as default
|
||||||
ftype = from_pystr_to_cstr(['q'] * flen)
|
# {'q': quantitative, 'i': indicator}
|
||||||
|
ftype = from_pystr_to_cstr(['q'] * flen)
|
||||||
|
else:
|
||||||
|
ftype = from_pystr_to_cstr(self.feature_types)
|
||||||
_check_call(_LIB.XGBoosterDumpModelWithFeatures(self.handle,
|
_check_call(_LIB.XGBoosterDumpModelWithFeatures(self.handle,
|
||||||
flen,
|
flen,
|
||||||
fname,
|
fname,
|
||||||
@ -849,12 +927,14 @@ class Booster(object):
|
|||||||
fmap[fid] += 1
|
fmap[fid] += 1
|
||||||
return fmap
|
return fmap
|
||||||
|
|
||||||
def _validate_feature_names(self, data):
|
def _validate_features(self, data):
|
||||||
"""
|
"""
|
||||||
Validate Booster and data's feature_names are identical
|
Validate Booster and data's feature_names are identical.
|
||||||
|
If the Booster has no feature_names yet, adopt feature_names and feature_types from the DMatrix.
|
||||||
"""
|
"""
|
||||||
if self.feature_names is None:
|
if self.feature_names is None:
|
||||||
self.feature_names = data.feature_names
|
self.feature_names = data.feature_names
|
||||||
|
self.feature_types = data.feature_types
|
||||||
else:
|
else:
|
||||||
# Booster can't accept data with different feature names
|
# Booster can't accept data with different feature names
|
||||||
if self.feature_names != data.feature_names:
|
if self.feature_names != data.feature_names:
|
||||||
|
|||||||
@ -92,7 +92,7 @@ def plot_importance(booster, ax=None, height=0.2,
|
|||||||
_NODEPAT = re.compile(r'(\d+):\[(.+)\]')
|
_NODEPAT = re.compile(r'(\d+):\[(.+)\]')
|
||||||
_LEAFPAT = re.compile(r'(\d+):(leaf=.+)')
|
_LEAFPAT = re.compile(r'(\d+):(leaf=.+)')
|
||||||
_EDGEPAT = re.compile(r'yes=(\d+),no=(\d+),missing=(\d+)')
|
_EDGEPAT = re.compile(r'yes=(\d+),no=(\d+),missing=(\d+)')
|
||||||
|
_EDGEPAT2 = re.compile(r'yes=(\d+),no=(\d+)')
|
||||||
|
|
||||||
def _parse_node(graph, text):
|
def _parse_node(graph, text):
|
||||||
"""parse dumped node"""
|
"""parse dumped node"""
|
||||||
@ -111,15 +111,24 @@ def _parse_node(graph, text):
|
|||||||
|
|
||||||
def _parse_edge(graph, node, text, yes_color='#0000FF', no_color='#FF0000'):
|
def _parse_edge(graph, node, text, yes_color='#0000FF', no_color='#FF0000'):
|
||||||
"""parse dumped edge"""
|
"""parse dumped edge"""
|
||||||
match = _EDGEPAT.match(text)
|
try:
|
||||||
|
match = _EDGEPAT.match(text)
|
||||||
|
if match is not None:
|
||||||
|
yes, no, missing = match.groups()
|
||||||
|
if yes == missing:
|
||||||
|
graph.edge(node, yes, label='yes, missing', color=yes_color)
|
||||||
|
graph.edge(node, no, label='no', color=no_color)
|
||||||
|
else:
|
||||||
|
graph.edge(node, yes, label='yes', color=yes_color)
|
||||||
|
graph.edge(node, no, label='no, missing', color=no_color)
|
||||||
|
return
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
match = _EDGEPAT2.match(text)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
yes, no, missing = match.groups()
|
yes, no = match.groups()
|
||||||
if yes == missing:
|
graph.edge(node, yes, label='yes', color=yes_color)
|
||||||
graph.edge(node, yes, label='yes, missing', color=yes_color)
|
graph.edge(node, no, label='no', color=no_color)
|
||||||
graph.edge(node, no, label='no', color=no_color)
|
|
||||||
else:
|
|
||||||
graph.edge(node, yes, label='yes', color=yes_color)
|
|
||||||
graph.edge(node, no, label='no, missing', color=no_color)
|
|
||||||
return
|
return
|
||||||
raise ValueError('Unable to parse edge: {0}'.format(text))
|
raise ValueError('Unable to parse edge: {0}'.format(text))
|
||||||
|
|
||||||
|
|||||||
@ -47,6 +47,25 @@ class TestBasic(unittest.TestCase):
|
|||||||
self.assertRaises(ValueError, xgb.DMatrix, data,
|
self.assertRaises(ValueError, xgb.DMatrix, data,
|
||||||
feature_names=['a', 'b', 'c', 'd', 'e=1'])
|
feature_names=['a', 'b', 'c', 'd', 'e=1'])
|
||||||
|
|
||||||
|
dm = xgb.DMatrix(data)
|
||||||
|
dm.feature_names = list('abcde')
|
||||||
|
assert dm.feature_names == list('abcde')
|
||||||
|
|
||||||
|
dm.feature_types = 'q'
|
||||||
|
assert dm.feature_types == list('qqqqq')
|
||||||
|
|
||||||
|
dm.feature_types = list('qiqiq')
|
||||||
|
assert dm.feature_types == list('qiqiq')
|
||||||
|
|
||||||
|
def incorrect_type_set():
|
||||||
|
dm.feature_types = list('abcde')
|
||||||
|
self.assertRaises(ValueError, incorrect_type_set)
|
||||||
|
|
||||||
|
# reset
|
||||||
|
dm.feature_names = None
|
||||||
|
assert dm.feature_names is None
|
||||||
|
assert dm.feature_types is None
|
||||||
|
|
||||||
def test_feature_names(self):
|
def test_feature_names(self):
|
||||||
data = np.random.randn(100, 5)
|
data = np.random.randn(100, 5)
|
||||||
target = np.array([0, 1] * 50)
|
target = np.array([0, 1] * 50)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user