From b943becc618819ac2e866b11168b79be48a8a420 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 1 Oct 2015 22:39:56 +0900 Subject: [PATCH] python DMatrix now accepts pandas DataFrame --- python-package/xgboost/core.py | 31 ++++++++++++++++++++++++++++++- scripts/travis_script.sh | 2 +- tests/python/test_basic.py | 21 +++++++++++++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index aaddc43fb..8c2567820 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -138,6 +138,28 @@ def c_array(ctype, values): return (ctype * len(values))(*values) +def _maybe_from_pandas(data, feature_names, feature_types): + """ Extract internal data from pd.DataFrame """ + try: + import pandas as pd + except ImportError: + return data, feature_names, feature_types + + if not isinstance(data, pd.DataFrame): + return data, feature_names, feature_types + + dtypes = data.dtypes + if not all(dtype.name in ('int64', 'float64', 'bool') for dtype in dtypes): + raise ValueError('DataFrame.dtypes must be int, float or bool') + + if feature_names is None: + feature_names = data.columns.tolist() + if feature_types is None: + mapper = {'int64': 'int', 'float64': 'q', 'bool': 'i'} + feature_types = [mapper[dtype.name] for dtype in dtypes] + data = data.values.astype('float') + return data, feature_names, feature_types + class DMatrix(object): """Data Matrix used in XGBoost. @@ -157,7 +179,7 @@ class DMatrix(object): Parameters ---------- - data : string/numpy array/scipy.sparse + data : string/numpy array/scipy.sparse/pd.DataFrame Data source of DMatrix. When data is string type, it represents the path libsvm format txt file, or binary file that xgboost can read from. 
def test_pandas(self):
    """Check DMatrix construction from a pandas DataFrame."""
    import pandas as pd

    rows = [[1, 2., True], [2, 3., False]]
    frame = pd.DataFrame(rows, columns=['a', 'b', 'c'])

    # no explicit metadata: names and types are expected to come from the frame
    dmat = xgb.DMatrix(frame, label=pd.Series([1, 2]))
    assert dmat.feature_names == ['a', 'b', 'c']
    assert dmat.feature_types == ['int', 'q', 'i']
    assert dmat.num_row() == 2
    assert dmat.num_col() == 3

    # overwrite feature_names and feature_types
    custom_names = ['x', 'y', 'z']
    custom_types = ['q', 'q', 'q']
    dmat = xgb.DMatrix(frame, label=pd.Series([1, 2]),
                       feature_names=custom_names, feature_types=custom_types)
    assert dmat.feature_names == ['x', 'y', 'z']
    assert dmat.feature_types == ['q', 'q', 'q']
    assert dmat.num_row() == 2
    assert dmat.num_col() == 3

    # incorrect dtypes: a string column must be rejected
    bad_rows = [[1, 2., 'x'], [2, 3., 'y']]
    bad_frame = pd.DataFrame(bad_rows, columns=['a', 'b', 'c'])
    self.assertRaises(ValueError, xgb.DMatrix, bad_frame)