Simplify the data backends. (#5893)

This commit is contained in:
Jiaming Yuan
2020-07-16 15:17:31 +08:00
committed by GitHub
parent 7aee0e51ed
commit 029a8b533f
8 changed files with 790 additions and 806 deletions

View File

@@ -1,17 +1,14 @@
# -*- coding: utf-8 -*-
import sys
from contextlib import contextmanager
try:
# python 2
from StringIO import StringIO
except ImportError:
# python 3
from io import StringIO
from io import StringIO
import numpy as np
import os
import xgboost as xgb
import unittest
import json
from pathlib import Path
import tempfile
dpath = 'demo/data/'
rng = np.random.RandomState(1994)
@@ -66,16 +63,19 @@ class TestBasic(unittest.TestCase):
# error must be smaller than 10%
assert err < 0.1
# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
# save model
bst.save_model('xgb.model')
# load model and data in
bst2 = xgb.Booster(model_file='xgb.model')
dtest2 = xgb.DMatrix('dtest.buffer')
preds2 = bst2.predict(dtest2)
# assert they are the same
assert np.sum(np.abs(preds2 - preds)) == 0
with tempfile.TemporaryDirectory() as tmpdir:
dtest_path = os.path.join(tmpdir, 'dtest.dmatrix')
# save dmatrix into binary buffer
dtest.save_binary(dtest_path)
# save model
model_path = os.path.join(tmpdir, 'model.booster')
bst.save_model(model_path)
# load model and data in
bst2 = xgb.Booster(model_file=model_path)
dtest2 = xgb.DMatrix(dtest_path)
preds2 = bst2.predict(dtest2)
# assert they are the same
assert np.sum(np.abs(preds2 - preds)) == 0
def test_record_results(self):
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')

View File

@@ -67,8 +67,7 @@ class TestPandas(unittest.TestCase):
# 0 1 1 0 0
# 1 2 0 1 0
# 2 3 0 0 1
pandas_handler = xgb.data.PandasHandler(np.nan, 0, False)
result, _, _ = pandas_handler._maybe_pandas_data(dummies, None, None)
result, _, _ = xgb.data._transform_pandas_df(dummies)
exp = np.array([[1., 1., 0., 0.],
[2., 0., 1., 0.],
[3., 0., 0., 1.]])
@@ -129,18 +128,17 @@ class TestPandas(unittest.TestCase):
def test_pandas_label(self):
# label must be a single column
df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
pandas_handler = xgb.data.PandasHandler(np.nan, 0, False)
self.assertRaises(ValueError, pandas_handler._maybe_pandas_data, df,
self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
None, None, 'label', 'float')
# label must be supported dtype
df = pd.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
self.assertRaises(ValueError, pandas_handler._maybe_pandas_data, df,
self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
None, None, 'label', 'float')
df = pd.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
result, _, _ = pandas_handler._maybe_pandas_data(df, None, None,
'label', 'float')
result, _, _ = xgb.data._transform_pandas_df(df, None, None,
'label', 'float')
np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]],
dtype=float))
dm = xgb.DMatrix(np.random.randn(3, 2), label=df)