Merge pull request #623 from sinhrks/pandas_label
Cleanup pandas support
This commit is contained in:
commit
cb5171914e
@ -1,6 +1,8 @@
|
||||
# pylint: disable=invalid-name, exec-used
|
||||
"""Convert README.md to README.rst for PyPI"""
|
||||
|
||||
from pypandoc import convert
|
||||
|
||||
read_md = convert('python-package/README.md', 'rst')
|
||||
with open('python-package/README.rst', 'w') as rst_file:
|
||||
rst_file.write(read_md)
|
||||
|
||||
47
python-package/xgboost/compat.py
Normal file
47
python-package/xgboost/compat.py
Normal file
@ -0,0 +1,47 @@
|
||||
# coding: utf-8
|
||||
# pylint: disable=unused-import, invalid-name
|
||||
"""For compatibility"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
# True when running on Python 3 or newer. ``>=`` is used instead of ``==`` so
# that a later major version does not fall through to the Python 2 branch
# below, where looking up ``basestring`` would raise NameError.
PY3 = (sys.version_info[0] >= 3)

if PY3:
    # pylint: disable=invalid-name, redefined-builtin
    # Tuple of types treated as "string" in isinstance() checks.
    STRING_TYPES = (str,)
else:
    # pylint: disable=invalid-name
    # Python 2 only: ``basestring`` does not exist on Python 3.
    STRING_TYPES = (basestring,)
|
||||
|
||||
# pandas
|
||||
try:
    from pandas import DataFrame
except ImportError:
    PANDAS_INSTALLED = False

    class DataFrame(object):
        """Stand-in for pandas.DataFrame when pandas cannot be imported.

        It only exists so that ``isinstance(obj, DataFrame)`` checks can be
        written without importing pandas at the call site.
        """
else:
    PANDAS_INSTALLED = True
|
||||
|
||||
# sklearn
|
||||
try:
    from sklearn.base import BaseEstimator
    from sklearn.base import RegressorMixin, ClassifierMixin
    from sklearn.preprocessing import LabelEncoder
    SKLEARN_INSTALLED = True

    # Base classes for the scikit-learn style wrappers.
    XGBModelBase = BaseEstimator
    XGBRegressorBase = RegressorMixin
    XGBClassifierBase = ClassifierMixin
except ImportError:
    SKLEARN_INSTALLED = False

    # used for compatibility without sklearn
    XGBModelBase = object
    XGBClassifierBase = object
    XGBRegressorBase = object
    # Keep the name importable: the sklearn wrapper module imports
    # ``LabelEncoder`` from here, which would otherwise raise ImportError
    # whenever scikit-learn is not installed.
    LabelEncoder = None
|
||||
@ -4,7 +4,6 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import os
|
||||
import sys
|
||||
import ctypes
|
||||
import collections
|
||||
|
||||
@ -13,20 +12,12 @@ import scipy.sparse
|
||||
|
||||
from .libpath import find_lib_path
|
||||
|
||||
from .compat import STRING_TYPES, PY3, DataFrame
|
||||
|
||||
class XGBoostError(Exception):
    """Error thrown by xgboost trainer."""
|
||||
|
||||
PY3 = (sys.version_info[0] == 3)
|
||||
|
||||
if PY3:
|
||||
# pylint: disable=invalid-name, redefined-builtin
|
||||
STRING_TYPES = str,
|
||||
else:
|
||||
# pylint: disable=invalid-name
|
||||
STRING_TYPES = basestring,
|
||||
|
||||
|
||||
def from_pystr_to_cstr(data):
|
||||
"""Convert a list of Python str to C pointer
|
||||
@ -138,42 +129,49 @@ def c_array(ctype, values):
|
||||
return (ctype * len(values))(*values)
|
||||
|
||||
|
||||
def _maybe_from_pandas(data, label, feature_names, feature_types):
|
||||
""" Extract internal data from pd.DataFrame """
|
||||
try:
|
||||
import pandas as pd
|
||||
except ImportError:
|
||||
return data, label, feature_names, feature_types
|
||||
|
||||
if not isinstance(data, pd.DataFrame):
|
||||
return data, label, feature_names, feature_types
|
||||
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
|
||||
'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int',
|
||||
'float16': 'float', 'float32': 'float', 'float64': 'float',
|
||||
'bool': 'i'}
|
||||
|
||||
mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
|
||||
'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int',
|
||||
'float16': 'float', 'float32': 'float', 'float64': 'float',
|
||||
'bool': 'i'}
|
||||
|
||||
def _maybe_pandas_data(data, feature_names, feature_types):
|
||||
""" Extract internal data from pd.DataFrame for DMatrix data """
|
||||
|
||||
if not isinstance(data, DataFrame):
|
||||
return data, feature_names, feature_types
|
||||
|
||||
data_dtypes = data.dtypes
|
||||
if not all(dtype.name in (mapper.keys()) for dtype in data_dtypes):
|
||||
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
|
||||
raise ValueError('DataFrame.dtypes for data must be int, float or bool')
|
||||
|
||||
if label is not None:
|
||||
if isinstance(label, pd.DataFrame):
|
||||
label_dtypes = label.dtypes
|
||||
if not all(dtype.name in (mapper.keys()) for dtype in label_dtypes):
|
||||
raise ValueError('DataFrame.dtypes for label must be int, float or bool')
|
||||
else:
|
||||
label = label.values.astype('float')
|
||||
|
||||
if feature_names is None:
|
||||
feature_names = data.columns.format()
|
||||
|
||||
if feature_types is None:
|
||||
feature_types = [mapper[dtype.name] for dtype in data_dtypes]
|
||||
feature_types = [PANDAS_DTYPE_MAPPER[dtype.name] for dtype in data_dtypes]
|
||||
|
||||
data = data.values.astype('float')
|
||||
|
||||
return data, label, feature_names, feature_types
|
||||
return data, feature_names, feature_types
|
||||
|
||||
|
||||
def _maybe_pandas_label(label):
    """Turn a DataFrame label into a float array for DMatrix.

    Parameters
    ----------
    label : object
        Candidate label. Only ``DataFrame`` instances are converted.

    Returns
    -------
    ``label.values.astype('float')`` for a DataFrame, otherwise ``label``
    itself, unchanged.

    Raises
    ------
    ValueError
        If the DataFrame has more than one column, or any column dtype is
        not listed in ``PANDAS_DTYPE_MAPPER``.
    """
    if not isinstance(label, DataFrame):
        # pd.Series can be passed to xgb as it is
        return label

    if len(label.columns) > 1:
        raise ValueError('DataFrame for label cannot have multiple columns')

    has_unsupported_dtype = any(dtype.name not in PANDAS_DTYPE_MAPPER
                                for dtype in label.dtypes)
    if has_unsupported_dtype:
        raise ValueError('DataFrame.dtypes for label must be int, float or bool')

    return label.values.astype('float')
|
||||
|
||||
class DMatrix(object):
|
||||
"""Data Matrix used in XGBoost.
|
||||
@ -216,13 +214,10 @@ class DMatrix(object):
|
||||
self.handle = None
|
||||
return
|
||||
|
||||
klass = getattr(getattr(data, '__class__', None), '__name__', None)
|
||||
if klass == 'DataFrame':
|
||||
# once check class name to avoid unnecessary pandas import
|
||||
data, label, feature_names, feature_types = _maybe_from_pandas(data,
|
||||
label,
|
||||
feature_names,
|
||||
feature_types)
|
||||
data, feature_names, feature_types = _maybe_pandas_data(data,
|
||||
feature_names,
|
||||
feature_types)
|
||||
label = _maybe_pandas_label(label)
|
||||
|
||||
if isinstance(data, STRING_TYPES):
|
||||
self.handle = ctypes.c_void_p()
|
||||
|
||||
@ -7,23 +7,9 @@ import numpy as np
|
||||
from .core import Booster, DMatrix, XGBoostError
|
||||
from .training import train
|
||||
|
||||
try:
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.base import RegressorMixin, ClassifierMixin
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
SKLEARN_INSTALLED = True
|
||||
except ImportError:
|
||||
SKLEARN_INSTALLED = False
|
||||
from .compat import (SKLEARN_INSTALLED, XGBModelBase,
|
||||
XGBClassifierBase, XGBRegressorBase, LabelEncoder)
|
||||
|
||||
# used for compatibility without sklearn
|
||||
XGBModelBase = object
|
||||
XGBClassifierBase = object
|
||||
XGBRegressorBase = object
|
||||
|
||||
if SKLEARN_INSTALLED:
|
||||
XGBModelBase = BaseEstimator
|
||||
XGBRegressorBase = RegressorMixin
|
||||
XGBClassifierBase = ClassifierMixin
|
||||
|
||||
class XGBModel(XGBModelBase):
|
||||
# pylint: disable=too-many-arguments, too-many-instance-attributes, invalid-name
|
||||
|
||||
@ -136,6 +136,51 @@ class TestBasic(unittest.TestCase):
|
||||
assert dm.num_row() == 2
|
||||
assert dm.num_col() == 3
|
||||
|
||||
df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
|
||||
dummies = pd.get_dummies(df)
|
||||
# B A_X A_Y A_Z
|
||||
# 0 1 1 0 0
|
||||
# 1 2 0 1 0
|
||||
# 2 3 0 0 1
|
||||
result, _, _ = xgb.core._maybe_pandas_data(dummies, None, None)
|
||||
exp = np.array([[ 1., 1., 0., 0.],
|
||||
[ 2., 0., 1., 0.],
|
||||
[ 3., 0., 0., 1.]])
|
||||
np.testing.assert_array_equal(result, exp)
|
||||
|
||||
dm = xgb.DMatrix(dummies)
|
||||
assert dm.feature_names == ['B', 'A_X', 'A_Y', 'A_Z']
|
||||
assert dm.feature_types == ['int', 'float', 'float', 'float']
|
||||
assert dm.num_row() == 3
|
||||
assert dm.num_col() == 4
|
||||
|
||||
df = pd.DataFrame({'A=1': [1, 2, 3], 'A=2': [4, 5, 6]})
|
||||
dm = xgb.DMatrix(df)
|
||||
assert dm.feature_names == ['A=1', 'A=2']
|
||||
assert dm.feature_types == ['int', 'int']
|
||||
assert dm.num_row() == 3
|
||||
assert dm.num_col() == 2
|
||||
|
||||
def test_pandas_label(self):
    """Check validation and float conversion of DataFrame labels."""
    import pandas as pd

    to_label = xgb.core._maybe_pandas_label

    # label must be a single column
    two_columns = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
    with self.assertRaises(ValueError):
        to_label(two_columns)

    # label must be supported dtype
    object_column = pd.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
    with self.assertRaises(ValueError):
        to_label(object_column)

    # a single integer column comes back as an (n, 1) float array
    int_column = pd.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
    expected = np.array([[1.], [2.], [3.]], dtype=float)
    np.testing.assert_array_equal(to_label(int_column), expected)

    # the same DataFrame is accepted directly as a DMatrix label
    dm = xgb.DMatrix(np.random.randn(3, 2), label=int_column)
    assert dm.num_row() == 3
    assert dm.num_col() == 2
|
||||
|
||||
|
||||
def test_load_file_invalid(self):
|
||||
|
||||
self.assertRaises(ValueError, xgb.Booster,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user