Enable flake8
This commit is contained in:
parent
b3c9e6a0db
commit
8fc2456c87
@ -4,7 +4,7 @@ from __future__ import absolute_import
|
|||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
from setuptools import setup, find_packages
|
from setuptools import setup, find_packages
|
||||||
#import subprocess
|
# import subprocess
|
||||||
sys.path.insert(0, '.')
|
sys.path.insert(0, '.')
|
||||||
|
|
||||||
CURRENT_DIR = os.path.dirname(__file__)
|
CURRENT_DIR = os.path.dirname(__file__)
|
||||||
@ -18,12 +18,12 @@ exec(compile(open(libpath_py, "rb").read(), libpath_py, 'exec'), libpath, libpat
|
|||||||
|
|
||||||
LIB_PATH = libpath['find_lib_path']()
|
LIB_PATH = libpath['find_lib_path']()
|
||||||
print("Install libxgboost from: %s" % LIB_PATH)
|
print("Install libxgboost from: %s" % LIB_PATH)
|
||||||
#Please use setup_pip.py for generating and deploying pip installation
|
# Please use setup_pip.py for generating and deploying pip installation
|
||||||
#detailed instruction in setup_pip.py
|
# detailed instruction in setup_pip.py
|
||||||
setup(name='xgboost',
|
setup(name='xgboost',
|
||||||
version=open(os.path.join(CURRENT_DIR, 'xgboost/VERSION')).read().strip(),
|
version=open(os.path.join(CURRENT_DIR, 'xgboost/VERSION')).read().strip(),
|
||||||
#version='0.4a23',
|
# version='0.4a23',
|
||||||
description = "XGBoost Python Package",
|
description="XGBoost Python Package",
|
||||||
long_description=open(os.path.join(CURRENT_DIR, 'README.rst')).read(),
|
long_description=open(os.path.join(CURRENT_DIR, 'README.rst')).read(),
|
||||||
install_requires=[
|
install_requires=[
|
||||||
'numpy',
|
'numpy',
|
||||||
@ -33,8 +33,8 @@ setup(name='xgboost',
|
|||||||
maintainer_email='phunter.lau@gmail.com',
|
maintainer_email='phunter.lau@gmail.com',
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
packages=find_packages(),
|
packages=find_packages(),
|
||||||
#this will use MANIFEST.in during install where we specify additional files,
|
# this will use MANIFEST.in during install where we specify additional files,
|
||||||
#this is the golden line
|
# this is the golden line
|
||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
data_files=[('xgboost', LIB_PATH)],
|
data_files=[('xgboost', LIB_PATH)],
|
||||||
url='https://github.com/dmlc/xgboost')
|
url='https://github.com/dmlc/xgboost')
|
||||||
|
|||||||
@ -4,14 +4,14 @@ from __future__ import absolute_import
|
|||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
from setuptools import setup, find_packages
|
from setuptools import setup, find_packages
|
||||||
#import subprocess
|
# import subprocess
|
||||||
sys.path.insert(0, '.')
|
sys.path.insert(0, '.')
|
||||||
|
|
||||||
#this script is for packing and shipping pip installation
|
# this script is for packing and shipping pip installation
|
||||||
#it builds xgboost code on the fly and packs for pip
|
# it builds xgboost code on the fly and packs for pip
|
||||||
#please don't use this file for installing from github
|
# please don't use this file for installing from github
|
||||||
|
|
||||||
if os.name != 'nt': #if not windows, compile and install
|
if os.name != 'nt': # if not windows, compile and install
|
||||||
os.system('sh ./xgboost/build-python.sh')
|
os.system('sh ./xgboost/build-python.sh')
|
||||||
else:
|
else:
|
||||||
print('Windows users please use github installation.')
|
print('Windows users please use github installation.')
|
||||||
@ -28,12 +28,12 @@ exec(compile(open(libpath_py, "rb").read(), libpath_py, 'exec'), libpath, libpat
|
|||||||
|
|
||||||
LIB_PATH = libpath['find_lib_path']()
|
LIB_PATH = libpath['find_lib_path']()
|
||||||
|
|
||||||
#to deploy to pip, please use
|
# to deploy to pip, please use
|
||||||
#make pythonpack
|
# make pythonpack
|
||||||
#python setup.py register sdist upload
|
# python setup.py register sdist upload
|
||||||
#and be sure to test it firstly using "python setup.py register sdist upload -r pypitest"
|
# and be sure to test it firstly using "python setup.py register sdist upload -r pypitest"
|
||||||
setup(name='xgboost',
|
setup(name='xgboost',
|
||||||
#version=open(os.path.join(CURRENT_DIR, 'xgboost/VERSION')).read().strip(),
|
# version=open(os.path.join(CURRENT_DIR, 'xgboost/VERSION')).read().strip(),
|
||||||
version='0.4a30',
|
version='0.4a30',
|
||||||
description=open(os.path.join(CURRENT_DIR, 'README.rst')).read(),
|
description=open(os.path.join(CURRENT_DIR, 'README.rst')).read(),
|
||||||
install_requires=[
|
install_requires=[
|
||||||
@ -44,15 +44,15 @@ setup(name='xgboost',
|
|||||||
maintainer_email='phunter.lau@gmail.com',
|
maintainer_email='phunter.lau@gmail.com',
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
packages=find_packages(),
|
packages=find_packages(),
|
||||||
#don't need this and don't use this, give everything to MANIFEST.in
|
# don't need this and don't use this, give everything to MANIFEST.in
|
||||||
#package_dir = {'':'xgboost'},
|
# package_dir = {'':'xgboost'},
|
||||||
#package_data = {'': ['*.txt','*.md','*.sh'],
|
# package_data = {'': ['*.txt','*.md','*.sh'],
|
||||||
# }
|
# }
|
||||||
#this will use MANIFEST.in during install where we specify additional files,
|
# this will use MANIFEST.in during install where we specify additional files,
|
||||||
#this is the golden line
|
# this is the golden line
|
||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
#!!! don't use data_files for creating pip installation,
|
# !!! don't use data_files for creating pip installation,
|
||||||
#otherwise install_data process will copy it to
|
# otherwise install_data process will copy it to
|
||||||
#root directory for some machines, and cause confusions on building
|
# root directory for some machines, and cause confusions on building
|
||||||
#data_files=[('xgboost', LIB_PATH)],
|
# data_files=[('xgboost', LIB_PATH)],
|
||||||
url='https://github.com/dmlc/xgboost')
|
url='https://github.com/dmlc/xgboost')
|
||||||
|
|||||||
@ -10,7 +10,7 @@ import os
|
|||||||
|
|
||||||
from .core import DMatrix, Booster
|
from .core import DMatrix, Booster
|
||||||
from .training import train, cv
|
from .training import train, cv
|
||||||
from . import rabit
|
from . import rabit # noqa
|
||||||
try:
|
try:
|
||||||
from .sklearn import XGBModel, XGBClassifier, XGBRegressor
|
from .sklearn import XGBModel, XGBClassifier, XGBRegressor
|
||||||
from .plotting import plot_importance, plot_tree, to_graphviz
|
from .plotting import plot_importance, plot_tree, to_graphviz
|
||||||
|
|||||||
@ -12,11 +12,21 @@ PY3 = (sys.version_info[0] == 3)
|
|||||||
if PY3:
|
if PY3:
|
||||||
# pylint: disable=invalid-name, redefined-builtin
|
# pylint: disable=invalid-name, redefined-builtin
|
||||||
STRING_TYPES = str,
|
STRING_TYPES = str,
|
||||||
py_str = lambda x: x.decode('utf-8')
|
|
||||||
|
def py_str(x):
|
||||||
|
return x.decode('utf-8')
|
||||||
else:
|
else:
|
||||||
# pylint: disable=invalid-name
|
# pylint: disable=invalid-name
|
||||||
STRING_TYPES = basestring,
|
STRING_TYPES = basestring,
|
||||||
py_str = lambda x: x
|
|
||||||
|
def py_str(x):
|
||||||
|
return x
|
||||||
|
|
||||||
|
try:
|
||||||
|
import cPickle as pickle # noqa
|
||||||
|
except ImportError:
|
||||||
|
import pickle # noqa
|
||||||
|
|
||||||
|
|
||||||
# pandas
|
# pandas
|
||||||
try:
|
try:
|
||||||
@ -34,7 +44,7 @@ except ImportError:
|
|||||||
try:
|
try:
|
||||||
from sklearn.base import BaseEstimator
|
from sklearn.base import BaseEstimator
|
||||||
from sklearn.base import RegressorMixin, ClassifierMixin
|
from sklearn.base import RegressorMixin, ClassifierMixin
|
||||||
from sklearn.preprocessing import LabelEncoder
|
from sklearn.preprocessing import LabelEncoder # noqa
|
||||||
from sklearn.cross_validation import KFold, StratifiedKFold
|
from sklearn.cross_validation import KFold, StratifiedKFold
|
||||||
SKLEARN_INSTALLED = True
|
SKLEARN_INSTALLED = True
|
||||||
|
|
||||||
|
|||||||
@ -14,6 +14,7 @@ from .libpath import find_lib_path
|
|||||||
|
|
||||||
from .compat import STRING_TYPES, PY3, DataFrame, py_str
|
from .compat import STRING_TYPES, PY3, DataFrame, py_str
|
||||||
|
|
||||||
|
|
||||||
class XGBoostError(Exception):
|
class XGBoostError(Exception):
|
||||||
"""Error throwed by xgboost trainer."""
|
"""Error throwed by xgboost trainer."""
|
||||||
pass
|
pass
|
||||||
@ -82,6 +83,7 @@ def _load_lib():
|
|||||||
# load the XGBoost library globally
|
# load the XGBoost library globally
|
||||||
_LIB = _load_lib()
|
_LIB = _load_lib()
|
||||||
|
|
||||||
|
|
||||||
def _check_call(ret):
|
def _check_call(ret):
|
||||||
"""Check the return value of C API call
|
"""Check the return value of C API call
|
||||||
|
|
||||||
@ -129,7 +131,6 @@ def c_array(ctype, values):
|
|||||||
return (ctype * len(values))(*values)
|
return (ctype * len(values))(*values)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
|
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
|
||||||
'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int',
|
'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int',
|
||||||
'float16': 'float', 'float32': 'float', 'float64': 'float',
|
'float16': 'float', 'float32': 'float', 'float64': 'float',
|
||||||
@ -144,8 +145,12 @@ def _maybe_pandas_data(data, feature_names, feature_types):
|
|||||||
|
|
||||||
data_dtypes = data.dtypes
|
data_dtypes = data.dtypes
|
||||||
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
|
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
|
||||||
bad_fields = [data.columns[i] for i, dtype in enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER ]
|
bad_fields = [data.columns[i] for i, dtype in
|
||||||
raise ValueError('DataFrame.dtypes for data must be int, float or bool.\nDid not expect the data types in fields '+', '.join(bad_fields))
|
enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]
|
||||||
|
|
||||||
|
msg = """DataFrame.dtypes for data must be int, float or bool.
|
||||||
|
Did not expect the data types in fields """
|
||||||
|
raise ValueError(msg + ', '.join(bad_fields))
|
||||||
|
|
||||||
if feature_names is None:
|
if feature_names is None:
|
||||||
feature_names = data.columns.format()
|
feature_names = data.columns.format()
|
||||||
@ -174,6 +179,7 @@ def _maybe_pandas_label(label):
|
|||||||
|
|
||||||
return label
|
return label
|
||||||
|
|
||||||
|
|
||||||
class DMatrix(object):
|
class DMatrix(object):
|
||||||
"""Data Matrix used in XGBoost.
|
"""Data Matrix used in XGBoost.
|
||||||
|
|
||||||
@ -1041,8 +1047,14 @@ class Booster(object):
|
|||||||
if self.feature_names != data.feature_names:
|
if self.feature_names != data.feature_names:
|
||||||
dat_missing = set(self.feature_names) - set(data.feature_names)
|
dat_missing = set(self.feature_names) - set(data.feature_names)
|
||||||
my_missing = set(data.feature_names) - set(self.feature_names)
|
my_missing = set(data.feature_names) - set(self.feature_names)
|
||||||
|
|
||||||
msg = 'feature_names mismatch: {0} {1}'
|
msg = 'feature_names mismatch: {0} {1}'
|
||||||
if dat_missing: msg +='\nexpected ' + ', '.join(str(s) for s in dat_missing) +' in input data'
|
|
||||||
if my_missing: msg +='\ntraining data did not have the following fields: ' + ', '.join(str(s) for s in my_missing)
|
if dat_missing:
|
||||||
|
msg += '\nexpected ' + ', '.join(str(s) for s in dat_missing) + ' in input data'
|
||||||
|
|
||||||
|
if my_missing:
|
||||||
|
msg += '\ntraining data did not have the following fields: ' + ', '.join(str(s) for s in my_missing)
|
||||||
|
|
||||||
raise ValueError(msg.format(self.feature_names,
|
raise ValueError(msg.format(self.feature_names,
|
||||||
data.feature_names))
|
data.feature_names))
|
||||||
|
|||||||
@ -36,7 +36,8 @@ def find_lib_path():
|
|||||||
else:
|
else:
|
||||||
dll_path = [os.path.join(p, 'libxgboost.so') for p in dll_path]
|
dll_path = [os.path.join(p, 'libxgboost.so') for p in dll_path]
|
||||||
lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
|
lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
|
||||||
#From github issues, most of installation errors come from machines w/o compilers
|
|
||||||
|
# From github issues, most of installation errors come from machines w/o compilers
|
||||||
if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False):
|
if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False):
|
||||||
raise XGBoostLibraryNotFound(
|
raise XGBoostLibraryNotFound(
|
||||||
'Cannot find XGBoost Libarary in the candicate path, ' +
|
'Cannot find XGBoost Libarary in the candicate path, ' +
|
||||||
|
|||||||
@ -10,6 +10,7 @@ import numpy as np
|
|||||||
from .core import Booster
|
from .core import Booster
|
||||||
from .sklearn import XGBModel
|
from .sklearn import XGBModel
|
||||||
|
|
||||||
|
|
||||||
def plot_importance(booster, ax=None, height=0.2,
|
def plot_importance(booster, ax=None, height=0.2,
|
||||||
xlim=None, ylim=None, title='Feature importance',
|
xlim=None, ylim=None, title='Feature importance',
|
||||||
xlabel='F score', ylabel='Features',
|
xlabel='F score', ylabel='Features',
|
||||||
@ -105,6 +106,7 @@ _LEAFPAT = re.compile(r'(\d+):(leaf=.+)')
|
|||||||
_EDGEPAT = re.compile(r'yes=(\d+),no=(\d+),missing=(\d+)')
|
_EDGEPAT = re.compile(r'yes=(\d+),no=(\d+),missing=(\d+)')
|
||||||
_EDGEPAT2 = re.compile(r'yes=(\d+),no=(\d+)')
|
_EDGEPAT2 = re.compile(r'yes=(\d+),no=(\d+)')
|
||||||
|
|
||||||
|
|
||||||
def _parse_node(graph, text):
|
def _parse_node(graph, text):
|
||||||
"""parse dumped node"""
|
"""parse dumped node"""
|
||||||
match = _NODEPAT.match(text)
|
match = _NODEPAT.match(text)
|
||||||
|
|||||||
@ -1,11 +1,12 @@
|
|||||||
"""Distributed XGBoost Rabit related API."""
|
"""Distributed XGBoost Rabit related API."""
|
||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
import sys
|
import sys
|
||||||
import atexit
|
|
||||||
import ctypes
|
import ctypes
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from .core import _LIB, c_str, STRING_TYPES
|
from .core import _LIB, c_str, STRING_TYPES
|
||||||
|
from .compat import pickle
|
||||||
|
|
||||||
|
|
||||||
def _init_rabit():
|
def _init_rabit():
|
||||||
"""internal libary initializer."""
|
"""internal libary initializer."""
|
||||||
@ -15,6 +16,7 @@ def _init_rabit():
|
|||||||
_LIB.RabitIsDistributed.restype = ctypes.c_int
|
_LIB.RabitIsDistributed.restype = ctypes.c_int
|
||||||
_LIB.RabitVersionNumber.restype = ctypes.c_int
|
_LIB.RabitVersionNumber.restype = ctypes.c_int
|
||||||
|
|
||||||
|
|
||||||
def init(args=None):
|
def init(args=None):
|
||||||
"""Initialize the rabit libary with arguments"""
|
"""Initialize the rabit libary with arguments"""
|
||||||
if args is None:
|
if args is None:
|
||||||
@ -73,6 +75,7 @@ def tracker_print(msg):
|
|||||||
sys.stdout.write(msg)
|
sys.stdout.write(msg)
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
|
|
||||||
|
|
||||||
def get_processor_name():
|
def get_processor_name():
|
||||||
"""Get the processor name.
|
"""Get the processor name.
|
||||||
|
|
||||||
@ -127,14 +130,14 @@ def broadcast(data, root):
|
|||||||
|
|
||||||
# enumeration of dtypes
|
# enumeration of dtypes
|
||||||
DTYPE_ENUM__ = {
|
DTYPE_ENUM__ = {
|
||||||
np.dtype('int8') : 0,
|
np.dtype('int8'): 0,
|
||||||
np.dtype('uint8') : 1,
|
np.dtype('uint8'): 1,
|
||||||
np.dtype('int32') : 2,
|
np.dtype('int32'): 2,
|
||||||
np.dtype('uint32') : 3,
|
np.dtype('uint32'): 3,
|
||||||
np.dtype('int64') : 4,
|
np.dtype('int64'): 4,
|
||||||
np.dtype('uint64') : 5,
|
np.dtype('uint64'): 5,
|
||||||
np.dtype('float32') : 6,
|
np.dtype('float32'): 6,
|
||||||
np.dtype('float64') : 7
|
np.dtype('float64'): 7
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -175,6 +178,7 @@ def allreduce(data, op, prepare_fun=None):
|
|||||||
op, None, None)
|
op, None, None)
|
||||||
else:
|
else:
|
||||||
func_ptr = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
|
func_ptr = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
|
||||||
|
|
||||||
def pfunc(args):
|
def pfunc(args):
|
||||||
"""prepare function."""
|
"""prepare function."""
|
||||||
prepare_fun(data)
|
prepare_fun(data)
|
||||||
|
|||||||
@ -366,7 +366,6 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
|
|||||||
self.classes_ = np.unique(y)
|
self.classes_ = np.unique(y)
|
||||||
self.n_classes_ = len(self.classes_)
|
self.n_classes_ = len(self.classes_)
|
||||||
|
|
||||||
|
|
||||||
xgb_options = self.get_xgb_params()
|
xgb_options = self.get_xgb_params()
|
||||||
|
|
||||||
if callable(self.objective):
|
if callable(self.objective):
|
||||||
|
|||||||
@ -6,12 +6,12 @@ from __future__ import absolute_import
|
|||||||
|
|
||||||
import sys
|
import sys
|
||||||
import re
|
import re
|
||||||
import os
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from .core import Booster, STRING_TYPES
|
from .core import Booster, STRING_TYPES, XGBoostError
|
||||||
from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold, XGBKFold)
|
from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold)
|
||||||
from . import rabit
|
from . import rabit
|
||||||
|
|
||||||
|
|
||||||
def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
|
def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
|
||||||
maximize=False, early_stopping_rounds=None, evals_result=None,
|
maximize=False, early_stopping_rounds=None, evals_result=None,
|
||||||
verbose_eval=True, learning_rates=None, xgb_model=None):
|
verbose_eval=True, learning_rates=None, xgb_model=None):
|
||||||
@ -97,7 +97,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
|
|||||||
verbose_eval = True if verbose_eval_every_line > 0 else False
|
verbose_eval = True if verbose_eval_every_line > 0 else False
|
||||||
|
|
||||||
if rabit.get_rank() != 0:
|
if rabit.get_rank() != 0:
|
||||||
verbose_eval = False;
|
verbose_eval = False
|
||||||
|
|
||||||
if xgb_model is not None:
|
if xgb_model is not None:
|
||||||
if not isinstance(xgb_model, STRING_TYPES):
|
if not isinstance(xgb_model, STRING_TYPES):
|
||||||
@ -135,8 +135,9 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
|
|||||||
if isinstance(params, list):
|
if isinstance(params, list):
|
||||||
if len(params) != len(dict(params).items()):
|
if len(params) != len(dict(params).items()):
|
||||||
params = dict(params)
|
params = dict(params)
|
||||||
rabit.tracker_print("Multiple eval metrics have been passed: " \
|
msg = ("Multiple eval metrics have been passed: "
|
||||||
"'{0}' will be used for early stopping.\n\n".format(params['eval_metric']))
|
"'{0}' will be used for early stopping.\n\n")
|
||||||
|
rabit.tracker_print(msg.format(params['eval_metric']))
|
||||||
else:
|
else:
|
||||||
params = dict(params)
|
params = dict(params)
|
||||||
|
|
||||||
@ -173,7 +174,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
|
|||||||
|
|
||||||
# Distributed code: need to resume to this point.
|
# Distributed code: need to resume to this point.
|
||||||
# Skip the first update if it is a recovery step.
|
# Skip the first update if it is a recovery step.
|
||||||
if version % 2 == 0:
|
if version % 2 == 0:
|
||||||
bst.update(dtrain, i, obj)
|
bst.update(dtrain, i, obj)
|
||||||
bst.save_rabit_checkpoint()
|
bst.save_rabit_checkpoint()
|
||||||
version += 1
|
version += 1
|
||||||
@ -203,7 +204,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
|
|||||||
evals_idx = evals_name.index(key)
|
evals_idx = evals_name.index(key)
|
||||||
res_per_eval = len(res) // len(evals_name)
|
res_per_eval = len(res) // len(evals_name)
|
||||||
for r in range(res_per_eval):
|
for r in range(res_per_eval):
|
||||||
res_item = res[(evals_idx*res_per_eval) + r]
|
res_item = res[(evals_idx * res_per_eval) + r]
|
||||||
res_key = res_item[0]
|
res_key = res_item[0]
|
||||||
res_val = res_item[1]
|
res_val = res_item[1]
|
||||||
if res_key in evals_result[key]:
|
if res_key in evals_result[key]:
|
||||||
@ -224,7 +225,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
|
|||||||
elif i - best_iteration >= early_stopping_rounds:
|
elif i - best_iteration >= early_stopping_rounds:
|
||||||
best_msg = bst.attr('best_msg')
|
best_msg = bst.attr('best_msg')
|
||||||
if verbose_eval:
|
if verbose_eval:
|
||||||
rabit.tracker_print("Stopping. Best iteration:\n{}\n\n".format(best_msg))
|
msg = "Stopping. Best iteration:\n{}\n\n"
|
||||||
|
rabit.tracker_print(msg.format(best_msg))
|
||||||
break
|
break
|
||||||
# do checkpoint after evaluation, in case evaluation also updates booster.
|
# do checkpoint after evaluation, in case evaluation also updates booster.
|
||||||
bst.save_rabit_checkpoint()
|
bst.save_rabit_checkpoint()
|
||||||
@ -290,6 +292,7 @@ def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False,
|
|||||||
ret.append(CVPack(dtrain, dtest, plst))
|
ret.append(CVPack(dtrain, dtest, plst))
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
def aggcv(rlist, show_stdv=True, verbose_eval=None, as_pandas=True, trial=0):
|
def aggcv(rlist, show_stdv=True, verbose_eval=None, as_pandas=True, trial=0):
|
||||||
# pylint: disable=invalid-name
|
# pylint: disable=invalid-name
|
||||||
"""
|
"""
|
||||||
@ -405,8 +408,8 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
|
|||||||
-------
|
-------
|
||||||
evaluation history : list(string)
|
evaluation history : list(string)
|
||||||
"""
|
"""
|
||||||
if stratified == True and not SKLEARN_INSTALLED:
|
if stratified is True and not SKLEARN_INSTALLED:
|
||||||
raise XGBoostError('sklearn needs to be installed in order to use stratified cv')
|
raise XGBoostError('sklearn needs to be installed in order to use stratified cv')
|
||||||
|
|
||||||
if isinstance(metrics, str):
|
if isinstance(metrics, str):
|
||||||
metrics = [metrics]
|
metrics = [metrics]
|
||||||
@ -417,7 +420,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
|
|||||||
if 'eval_metric' in params:
|
if 'eval_metric' in params:
|
||||||
params['eval_metric'] = _metrics
|
params['eval_metric'] = _metrics
|
||||||
else:
|
else:
|
||||||
params= dict((k, v) for k, v in params.items())
|
params = dict((k, v) for k, v in params.items())
|
||||||
|
|
||||||
if len(metrics) == 0 and 'eval_metric' in params:
|
if len(metrics) == 0 and 'eval_metric' in params:
|
||||||
if isinstance(params['eval_metric'], list):
|
if isinstance(params['eval_metric'], list):
|
||||||
@ -428,12 +431,14 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
|
|||||||
params.pop("eval_metric", None)
|
params.pop("eval_metric", None)
|
||||||
|
|
||||||
if early_stopping_rounds is not None:
|
if early_stopping_rounds is not None:
|
||||||
|
|
||||||
if len(metrics) > 1:
|
if len(metrics) > 1:
|
||||||
raise ValueError('Check your params. '\
|
msg = ('Check your params. '
|
||||||
'Early stopping works with single eval metric only.')
|
'Early stopping works with single eval metric only.')
|
||||||
|
raise ValueError(msg)
|
||||||
if verbose_eval:
|
if verbose_eval:
|
||||||
sys.stderr.write("Will train until cv error hasn't decreased in {} rounds.\n".format(\
|
msg = "Will train until cv error hasn't decreased in {} rounds.\n"
|
||||||
early_stopping_rounds))
|
sys.stderr.write(msg.format(early_stopping_rounds))
|
||||||
|
|
||||||
maximize_score = False
|
maximize_score = False
|
||||||
if len(metrics) == 1:
|
if len(metrics) == 1:
|
||||||
@ -466,10 +471,10 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
|
|||||||
best_score = score
|
best_score = score
|
||||||
best_score_i = i
|
best_score_i = i
|
||||||
elif i - best_score_i >= early_stopping_rounds:
|
elif i - best_score_i >= early_stopping_rounds:
|
||||||
results = results[:best_score_i+1]
|
results = results[:best_score_i + 1]
|
||||||
if verbose_eval:
|
if verbose_eval:
|
||||||
sys.stderr.write("Stopping. Best iteration:\n[{}] cv-mean:{}\tcv-std:{}\n".
|
msg = "Stopping. Best iteration:\n[{}] cv-mean:{}\tcv-std:{}\n"
|
||||||
format(best_score_i, results[-1][0], results[-1][1]))
|
sys.stderr.write(msg.format(best_score_i, results[-1][0], results[-1][1]))
|
||||||
break
|
break
|
||||||
if as_pandas:
|
if as_pandas:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@ -8,6 +8,7 @@ rng = np.random.RandomState(1994)
|
|||||||
|
|
||||||
|
|
||||||
class TestBasic(unittest.TestCase):
|
class TestBasic(unittest.TestCase):
|
||||||
|
|
||||||
def test_basic(self):
|
def test_basic(self):
|
||||||
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||||
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
|
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
|
||||||
@ -37,7 +38,7 @@ class TestBasic(unittest.TestCase):
|
|||||||
def test_multiclass(self):
|
def test_multiclass(self):
|
||||||
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||||
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
|
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
|
||||||
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'num_class' : 2}
|
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'num_class': 2}
|
||||||
# specify validations set to watch performance
|
# specify validations set to watch performance
|
||||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||||
num_round = 2
|
num_round = 2
|
||||||
@ -60,7 +61,6 @@ class TestBasic(unittest.TestCase):
|
|||||||
# assert they are the same
|
# assert they are the same
|
||||||
assert np.sum(np.abs(preds2 - preds)) == 0
|
assert np.sum(np.abs(preds2 - preds)) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_dmatrix_init(self):
|
def test_dmatrix_init(self):
|
||||||
data = np.random.randn(5, 5)
|
data = np.random.randn(5, 5)
|
||||||
|
|
||||||
|
|||||||
@ -8,82 +8,94 @@ dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
|
|||||||
|
|
||||||
rng = np.random.RandomState(1994)
|
rng = np.random.RandomState(1994)
|
||||||
|
|
||||||
|
|
||||||
class TestModels(unittest.TestCase):
|
class TestModels(unittest.TestCase):
|
||||||
|
|
||||||
def test_glm(self):
|
def test_glm(self):
|
||||||
param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear', 'alpha': 0.0001, 'lambda': 1 }
|
param = {'silent': 1, 'objective': 'binary:logistic',
|
||||||
watchlist = [(dtest,'eval'), (dtrain,'train')]
|
'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1}
|
||||||
num_round = 4
|
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
num_round = 4
|
||||||
assert isinstance(bst, xgb.core.Booster)
|
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||||
preds = bst.predict(dtest)
|
assert isinstance(bst, xgb.core.Booster)
|
||||||
labels = dtest.get_label()
|
preds = bst.predict(dtest)
|
||||||
err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds))
|
labels = dtest.get_label()
|
||||||
assert err < 0.1
|
err = sum(1 for i in range(len(preds))
|
||||||
|
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
|
||||||
|
assert err < 0.1
|
||||||
|
|
||||||
def test_eta_decay(self):
|
def test_eta_decay(self):
|
||||||
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
|
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
|
||||||
watchlist = [(dtest,'eval'), (dtrain,'train')]
|
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||||
num_round = 2
|
num_round = 2
|
||||||
# learning_rates as a list
|
# learning_rates as a list
|
||||||
bst = xgb.train(param, dtrain, num_round, watchlist, learning_rates=[0.4, 0.3])
|
bst = xgb.train(param, dtrain, num_round, watchlist, learning_rates=[0.4, 0.3])
|
||||||
assert isinstance(bst, xgb.core.Booster)
|
assert isinstance(bst, xgb.core.Booster)
|
||||||
|
|
||||||
# learning_rates as a customized decay function
|
# learning_rates as a customized decay function
|
||||||
def eta_decay(ithround, num_boost_round):
|
def eta_decay(ithround, num_boost_round):
|
||||||
return num_boost_round / (ithround + 1)
|
return num_boost_round / (ithround + 1)
|
||||||
bst = xgb.train(param, dtrain, num_round, watchlist, learning_rates=eta_decay)
|
|
||||||
assert isinstance(bst, xgb.core.Booster)
|
|
||||||
|
|
||||||
|
bst = xgb.train(param, dtrain, num_round, watchlist, learning_rates=eta_decay)
|
||||||
|
assert isinstance(bst, xgb.core.Booster)
|
||||||
|
|
||||||
def test_custom_objective(self):
|
def test_custom_objective(self):
|
||||||
param = {'max_depth':2, 'eta':1, 'silent':1 }
|
param = {'max_depth': 2, 'eta': 1, 'silent': 1}
|
||||||
watchlist = [(dtest,'eval'), (dtrain,'train')]
|
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||||
num_round = 2
|
num_round = 2
|
||||||
def logregobj(preds, dtrain):
|
|
||||||
labels = dtrain.get_label()
|
|
||||||
preds = 1.0 / (1.0 + np.exp(-preds))
|
|
||||||
grad = preds - labels
|
|
||||||
hess = preds * (1.0-preds)
|
|
||||||
return grad, hess
|
|
||||||
def evalerror(preds, dtrain):
|
|
||||||
labels = dtrain.get_label()
|
|
||||||
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
|
|
||||||
|
|
||||||
# test custom_objective in training
|
def logregobj(preds, dtrain):
|
||||||
bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
|
labels = dtrain.get_label()
|
||||||
assert isinstance(bst, xgb.core.Booster)
|
preds = 1.0 / (1.0 + np.exp(-preds))
|
||||||
preds = bst.predict(dtest)
|
grad = preds - labels
|
||||||
labels = dtest.get_label()
|
hess = preds * (1.0 - preds)
|
||||||
err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds))
|
return grad, hess
|
||||||
assert err < 0.1
|
|
||||||
|
|
||||||
# test custom_objective in cross-validation
|
def evalerror(preds, dtrain):
|
||||||
xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
|
labels = dtrain.get_label()
|
||||||
obj = logregobj, feval=evalerror)
|
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
|
||||||
|
|
||||||
# test maximize parameter
|
# test custom_objective in training
|
||||||
def neg_evalerror(preds, dtrain):
|
bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
|
||||||
labels = dtrain.get_label()
|
assert isinstance(bst, xgb.core.Booster)
|
||||||
return 'error', float(sum(labels == (preds > 0.0))) / len(labels)
|
preds = bst.predict(dtest)
|
||||||
bst2 = xgb.train(param, dtrain, num_round, watchlist, logregobj, neg_evalerror, maximize=True)
|
labels = dtest.get_label()
|
||||||
preds2 = bst2.predict(dtest)
|
err = sum(1 for i in range(len(preds))
|
||||||
err2 = sum(1 for i in range(len(preds2)) if int(preds2[i]>0.5)!=labels[i]) / float(len(preds2))
|
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
|
||||||
assert err == err2
|
assert err < 0.1
|
||||||
|
|
||||||
def test_fpreproc(self):
|
# test custom_objective in cross-validation
|
||||||
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
|
xgb.cv(param, dtrain, num_round, nfold=5, seed=0,
|
||||||
num_round = 2
|
obj=logregobj, feval=evalerror)
|
||||||
def fpreproc(dtrain, dtest, param):
|
|
||||||
label = dtrain.get_label()
|
|
||||||
ratio = float(np.sum(label == 0)) / np.sum(label==1)
|
|
||||||
param['scale_pos_weight'] = ratio
|
|
||||||
return (dtrain, dtest, param)
|
|
||||||
xgb.cv(param, dtrain, num_round, nfold=5,
|
|
||||||
metrics={'auc'}, seed = 0, fpreproc = fpreproc)
|
|
||||||
|
|
||||||
def test_show_stdv(self):
|
# test maximize parameter
|
||||||
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
|
def neg_evalerror(preds, dtrain):
|
||||||
num_round = 2
|
labels = dtrain.get_label()
|
||||||
xgb.cv(param, dtrain, num_round, nfold=5,
|
return 'error', float(sum(labels == (preds > 0.0))) / len(labels)
|
||||||
metrics={'error'}, seed = 0, show_stdv = False)
|
|
||||||
|
bst2 = xgb.train(param, dtrain, num_round, watchlist, logregobj, neg_evalerror, maximize=True)
|
||||||
|
preds2 = bst2.predict(dtest)
|
||||||
|
err2 = sum(1 for i in range(len(preds2))
|
||||||
|
if int(preds2[i] > 0.5) != labels[i]) / float(len(preds2))
|
||||||
|
assert err == err2
|
||||||
|
|
||||||
|
def test_fpreproc(self):
|
||||||
|
param = {'max_depth': 2, 'eta': 1, 'silent': 1,
|
||||||
|
'objective': 'binary:logistic'}
|
||||||
|
num_round = 2
|
||||||
|
|
||||||
|
def fpreproc(dtrain, dtest, param):
|
||||||
|
label = dtrain.get_label()
|
||||||
|
ratio = float(np.sum(label == 0)) / np.sum(label == 1)
|
||||||
|
param['scale_pos_weight'] = ratio
|
||||||
|
return (dtrain, dtest, param)
|
||||||
|
|
||||||
|
xgb.cv(param, dtrain, num_round, nfold=5,
|
||||||
|
metrics={'auc'}, seed=0, fpreproc=fpreproc)
|
||||||
|
|
||||||
|
def test_show_stdv(self):
|
||||||
|
param = {'max_depth': 2, 'eta': 1, 'silent': 1,
|
||||||
|
'objective': 'binary:logistic'}
|
||||||
|
num_round = 2
|
||||||
|
xgb.cv(param, dtrain, num_round, nfold=5,
|
||||||
|
metrics={'error'}, seed=0, show_stdv=False)
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.datasets import load_digits
|
from sklearn.datasets import load_digits
|
||||||
from sklearn.cross_validation import KFold, train_test_split
|
from sklearn.cross_validation import train_test_split
|
||||||
from sklearn.metrics import mean_squared_error
|
from sklearn.metrics import mean_squared_error
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
@ -40,7 +40,6 @@ class TestEarlyStopping(unittest.TestCase):
|
|||||||
dm = xgb.DMatrix(X, label=y)
|
dm = xgb.DMatrix(X, label=y)
|
||||||
params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
|
params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=10)
|
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=10)
|
||||||
assert cv.shape[0] == 10
|
assert cv.shape[0] == 10
|
||||||
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=5)
|
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=5)
|
||||||
|
|||||||
@ -1,9 +1,8 @@
|
|||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.cross_validation import KFold, train_test_split
|
from sklearn.cross_validation import train_test_split
|
||||||
from sklearn.metrics import mean_squared_error
|
from sklearn.metrics import mean_squared_error
|
||||||
from sklearn.grid_search import GridSearchCV
|
from sklearn.datasets import load_digits
|
||||||
from sklearn.datasets import load_iris, load_digits, load_boston
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
rng = np.random.RandomState(1337)
|
rng = np.random.RandomState(1337)
|
||||||
|
|||||||
@ -12,6 +12,7 @@ matplotlib.use('Agg')
|
|||||||
dpath = 'demo/data/'
|
dpath = 'demo/data/'
|
||||||
rng = np.random.RandomState(1994)
|
rng = np.random.RandomState(1994)
|
||||||
|
|
||||||
|
|
||||||
class TestPlotting(unittest.TestCase):
|
class TestPlotting(unittest.TestCase):
|
||||||
def test_plotting(self):
|
def test_plotting(self):
|
||||||
bst2 = xgb.Booster(model_file='xgb.model')
|
bst2 = xgb.Booster(model_file='xgb.model')
|
||||||
|
|||||||
@ -1,10 +1,7 @@
|
|||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.preprocessing import MultiLabelBinarizer
|
|
||||||
from sklearn.cross_validation import KFold, train_test_split
|
|
||||||
from sklearn.metrics import mean_squared_error
|
from sklearn.metrics import mean_squared_error
|
||||||
from sklearn.grid_search import GridSearchCV
|
from sklearn.datasets import load_digits
|
||||||
from sklearn.datasets import load_iris, load_digits, load_boston
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
rng = np.random.RandomState(1337)
|
rng = np.random.RandomState(1337)
|
||||||
@ -57,10 +54,14 @@ class TestTrainingContinuation(unittest.TestCase):
|
|||||||
ntrees_02b = len(gbdt_02b.get_dump())
|
ntrees_02b = len(gbdt_02b.get_dump())
|
||||||
assert ntrees_02a == 10
|
assert ntrees_02a == 10
|
||||||
assert ntrees_02b == 10
|
assert ntrees_02b == 10
|
||||||
assert mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class)) == \
|
|
||||||
mean_squared_error(y_2class, gbdt_02a.predict(dtrain_2class))
|
res1 = mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class))
|
||||||
assert mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class)) == \
|
res2 = mean_squared_error(y_2class, gbdt_02a.predict(dtrain_2class))
|
||||||
mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class))
|
assert res1 == res2
|
||||||
|
|
||||||
|
res1 = mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class))
|
||||||
|
res2 = mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class))
|
||||||
|
assert res1 == res2
|
||||||
|
|
||||||
gbdt_03 = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=3)
|
gbdt_03 = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=3)
|
||||||
gbdt_03.save_model('xgb_tc.model')
|
gbdt_03.save_model('xgb_tc.model')
|
||||||
@ -71,22 +72,30 @@ class TestTrainingContinuation(unittest.TestCase):
|
|||||||
ntrees_03b = len(gbdt_03b.get_dump())
|
ntrees_03b = len(gbdt_03b.get_dump())
|
||||||
assert ntrees_03a == 10
|
assert ntrees_03a == 10
|
||||||
assert ntrees_03b == 10
|
assert ntrees_03b == 10
|
||||||
assert mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class)) == \
|
|
||||||
mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))
|
res1 = mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class))
|
||||||
|
res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))
|
||||||
|
assert res1 == res2
|
||||||
|
|
||||||
gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class, num_boost_round=3)
|
gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class, num_boost_round=3)
|
||||||
assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree
|
assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree
|
||||||
assert mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class)) == \
|
|
||||||
mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit))
|
res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
|
||||||
|
res2 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit))
|
||||||
|
assert res1 == res2
|
||||||
|
|
||||||
gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class, num_boost_round=7, xgb_model=gbdt_04)
|
gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class, num_boost_round=7, xgb_model=gbdt_04)
|
||||||
assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree
|
assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree
|
||||||
assert mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class)) == \
|
|
||||||
mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit))
|
res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
|
||||||
|
res2 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit))
|
||||||
|
assert res1 == res2
|
||||||
|
|
||||||
gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class, num_boost_round=7)
|
gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class, num_boost_round=7)
|
||||||
assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration + 1) * self.num_parallel_tree
|
assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration + 1) * self.num_parallel_tree
|
||||||
gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class, num_boost_round=3, xgb_model=gbdt_05)
|
gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class, num_boost_round=3, xgb_model=gbdt_05)
|
||||||
assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration + 1) * self.num_parallel_tree
|
assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration + 1) * self.num_parallel_tree
|
||||||
assert np.any(gbdt_05.predict(dtrain_5class) !=
|
|
||||||
gbdt_05.predict(dtrain_5class, ntree_limit=gbdt_05.best_ntree_limit)) == False
|
res1 = gbdt_05.predict(dtrain_5class)
|
||||||
|
res2 = gbdt_05.predict(dtrain_5class, ntree_limit=gbdt_05.best_ntree_limit)
|
||||||
|
np.testing.assert_almost_equal(res1, res2)
|
||||||
|
|||||||
@ -111,43 +111,55 @@ class TestPandas(unittest.TestCase):
|
|||||||
u'train-error-mean', u'train-error-std'])
|
u'train-error-mean', u'train-error-std'])
|
||||||
assert cv.columns.equals(exp)
|
assert cv.columns.equals(exp)
|
||||||
|
|
||||||
params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': 'auc'}
|
params = {'max_depth': 2, 'eta': 1, 'silent': 1,
|
||||||
|
'objective': 'binary:logistic', 'eval_metric': 'auc'}
|
||||||
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True)
|
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True)
|
||||||
assert 'eval_metric' in params
|
assert 'eval_metric' in params
|
||||||
assert 'auc' in cv.columns[0]
|
assert 'auc' in cv.columns[0]
|
||||||
|
|
||||||
params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': ['auc']}
|
params = {'max_depth': 2, 'eta': 1, 'silent': 1,
|
||||||
|
'objective': 'binary:logistic', 'eval_metric': ['auc']}
|
||||||
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True)
|
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True)
|
||||||
assert 'eval_metric' in params
|
assert 'eval_metric' in params
|
||||||
assert 'auc' in cv.columns[0]
|
assert 'auc' in cv.columns[0]
|
||||||
|
|
||||||
params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': ['auc']}
|
params = {'max_depth': 2, 'eta': 1, 'silent': 1,
|
||||||
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, early_stopping_rounds=1)
|
'objective': 'binary:logistic', 'eval_metric': ['auc']}
|
||||||
|
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
|
||||||
|
as_pandas=True, early_stopping_rounds=1)
|
||||||
assert 'eval_metric' in params
|
assert 'eval_metric' in params
|
||||||
assert 'auc' in cv.columns[0]
|
assert 'auc' in cv.columns[0]
|
||||||
assert cv.shape[0] < 10
|
assert cv.shape[0] < 10
|
||||||
|
|
||||||
params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
|
params = {'max_depth': 2, 'eta': 1, 'silent': 1,
|
||||||
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics='auc')
|
'objective': 'binary:logistic'}
|
||||||
|
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
|
||||||
|
as_pandas=True, metrics='auc')
|
||||||
assert 'auc' in cv.columns[0]
|
assert 'auc' in cv.columns[0]
|
||||||
|
|
||||||
params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
|
params = {'max_depth': 2, 'eta': 1, 'silent': 1,
|
||||||
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=['auc'])
|
'objective': 'binary:logistic'}
|
||||||
|
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
|
||||||
|
as_pandas=True, metrics=['auc'])
|
||||||
assert 'auc' in cv.columns[0]
|
assert 'auc' in cv.columns[0]
|
||||||
|
|
||||||
params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': ['auc']}
|
params = {'max_depth': 2, 'eta': 1, 'silent': 1,
|
||||||
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics='error')
|
'objective': 'binary:logistic', 'eval_metric': ['auc']}
|
||||||
|
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
|
||||||
|
as_pandas=True, metrics='error')
|
||||||
assert 'eval_metric' in params
|
assert 'eval_metric' in params
|
||||||
assert 'auc' not in cv.columns[0]
|
assert 'auc' not in cv.columns[0]
|
||||||
assert 'error' in cv.columns[0]
|
assert 'error' in cv.columns[0]
|
||||||
|
|
||||||
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=['error'])
|
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
|
||||||
|
as_pandas=True, metrics=['error'])
|
||||||
assert 'eval_metric' in params
|
assert 'eval_metric' in params
|
||||||
assert 'auc' not in cv.columns[0]
|
assert 'auc' not in cv.columns[0]
|
||||||
assert 'error' in cv.columns[0]
|
assert 'error' in cv.columns[0]
|
||||||
|
|
||||||
params = list(params.items())
|
params = list(params.items())
|
||||||
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=['error'])
|
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
|
||||||
|
as_pandas=True, metrics=['error'])
|
||||||
assert isinstance(params, list)
|
assert isinstance(params, list)
|
||||||
assert 'auc' not in cv.columns[0]
|
assert 'auc' not in cv.columns[0]
|
||||||
assert 'error' in cv.columns[0]
|
assert 'error' in cv.columns[0]
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.cross_validation import KFold
|
|
||||||
from sklearn.metrics import mean_squared_error
|
from sklearn.metrics import mean_squared_error
|
||||||
from sklearn.grid_search import GridSearchCV
|
from sklearn.grid_search import GridSearchCV
|
||||||
from sklearn.datasets import load_iris, load_digits, load_boston
|
from sklearn.datasets import load_iris, load_digits, load_boston
|
||||||
@ -8,33 +7,46 @@ from sklearn.cross_validation import KFold, StratifiedKFold, train_test_split
|
|||||||
|
|
||||||
rng = np.random.RandomState(1994)
|
rng = np.random.RandomState(1994)
|
||||||
|
|
||||||
|
|
||||||
def test_binary_classification():
|
def test_binary_classification():
|
||||||
digits = load_digits(2)
|
digits = load_digits(2)
|
||||||
y = digits['target']
|
y = digits['target']
|
||||||
X = digits['data']
|
X = digits['data']
|
||||||
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
|
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
|
||||||
for train_index, test_index in kf:
|
for train_index, test_index in kf:
|
||||||
xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
|
xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
|
||||||
preds = xgb_model.predict(X[test_index])
|
preds = xgb_model.predict(X[test_index])
|
||||||
labels = y[test_index]
|
labels = y[test_index]
|
||||||
err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds))
|
err = sum(1 for i in range(len(preds))
|
||||||
assert err < 0.1
|
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
|
||||||
|
assert err < 0.1
|
||||||
|
|
||||||
|
|
||||||
def test_multiclass_classification():
|
def test_multiclass_classification():
|
||||||
|
|
||||||
|
def check_pred(preds, labels):
|
||||||
|
err = sum(1 for i in range(len(preds))
|
||||||
|
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
|
||||||
|
assert err < 0.4
|
||||||
|
|
||||||
iris = load_iris()
|
iris = load_iris()
|
||||||
y = iris['target']
|
y = iris['target']
|
||||||
X = iris['data']
|
X = iris['data']
|
||||||
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
|
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
|
||||||
for train_index, test_index in kf:
|
for train_index, test_index in kf:
|
||||||
xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
|
xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
|
||||||
preds = xgb_model.predict(X[test_index])
|
preds = xgb_model.predict(X[test_index])
|
||||||
# test other params in XGBClassifier().fit
|
# test other params in XGBClassifier().fit
|
||||||
preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
|
preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
|
||||||
preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
|
preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
|
||||||
preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
|
preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
|
||||||
labels = y[test_index]
|
labels = y[test_index]
|
||||||
err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds))
|
|
||||||
assert err < 0.4
|
check_pred(preds, labels)
|
||||||
|
check_pred(preds2, labels)
|
||||||
|
check_pred(preds3, labels)
|
||||||
|
check_pred(preds4, labels)
|
||||||
|
|
||||||
|
|
||||||
def test_boston_housing_regression():
|
def test_boston_housing_regression():
|
||||||
boston = load_boston()
|
boston = load_boston()
|
||||||
@ -42,27 +54,33 @@ def test_boston_housing_regression():
|
|||||||
X = boston['data']
|
X = boston['data']
|
||||||
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
|
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
|
||||||
for train_index, test_index in kf:
|
for train_index, test_index in kf:
|
||||||
xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])
|
xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
|
||||||
|
|
||||||
preds = xgb_model.predict(X[test_index])
|
preds = xgb_model.predict(X[test_index])
|
||||||
# test other params in XGBRegressor().fit
|
# test other params in XGBRegressor().fit
|
||||||
preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
|
preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
|
||||||
preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
|
preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
|
||||||
preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
|
preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
|
||||||
labels = y[test_index]
|
labels = y[test_index]
|
||||||
assert mean_squared_error(preds, labels) < 25
|
|
||||||
|
assert mean_squared_error(preds, labels) < 25
|
||||||
|
assert mean_squared_error(preds2, labels) < 350
|
||||||
|
assert mean_squared_error(preds3, labels) < 25
|
||||||
|
assert mean_squared_error(preds4, labels) < 350
|
||||||
|
|
||||||
|
|
||||||
def test_parameter_tuning():
|
def test_parameter_tuning():
|
||||||
boston = load_boston()
|
boston = load_boston()
|
||||||
y = boston['target']
|
y = boston['target']
|
||||||
X = boston['data']
|
X = boston['data']
|
||||||
xgb_model = xgb.XGBRegressor()
|
xgb_model = xgb.XGBRegressor()
|
||||||
clf = GridSearchCV(xgb_model,
|
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
|
||||||
{'max_depth': [2,4,6],
|
'n_estimators': [50, 100, 200]}, verbose=1)
|
||||||
'n_estimators': [50,100,200]}, verbose=1)
|
clf.fit(X, y)
|
||||||
clf.fit(X,y)
|
|
||||||
assert clf.best_score_ < 0.7
|
assert clf.best_score_ < 0.7
|
||||||
assert clf.best_params_ == {'n_estimators': 100, 'max_depth': 4}
|
assert clf.best_params_ == {'n_estimators': 100, 'max_depth': 4}
|
||||||
|
|
||||||
|
|
||||||
def test_regression_with_custom_objective():
|
def test_regression_with_custom_objective():
|
||||||
def objective_ls(y_true, y_pred):
|
def objective_ls(y_true, y_pred):
|
||||||
grad = (y_pred - y_true)
|
grad = (y_pred - y_true)
|
||||||
@ -86,20 +104,17 @@ def test_regression_with_custom_objective():
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
def dummy_objective(y_true, y_pred):
|
def dummy_objective(y_true, y_pred):
|
||||||
raise XGBCustomObjectiveException()
|
raise XGBCustomObjectiveException()
|
||||||
|
|
||||||
xgb_model = xgb.XGBRegressor(objective=dummy_objective)
|
xgb_model = xgb.XGBRegressor(objective=dummy_objective)
|
||||||
np.testing.assert_raises(
|
np.testing.assert_raises(XGBCustomObjectiveException, xgb_model.fit, X, y)
|
||||||
XGBCustomObjectiveException,
|
|
||||||
xgb_model.fit,
|
|
||||||
X, y
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_classification_with_custom_objective():
|
def test_classification_with_custom_objective():
|
||||||
def logregobj(y_true, y_pred):
|
def logregobj(y_true, y_pred):
|
||||||
y_pred = 1.0 / (1.0 + np.exp(-y_pred))
|
y_pred = 1.0 / (1.0 + np.exp(-y_pred))
|
||||||
grad = y_pred - y_true
|
grad = y_pred - y_true
|
||||||
hess = y_pred * (1.0-y_pred)
|
hess = y_pred * (1.0 - y_pred)
|
||||||
return grad, hess
|
return grad, hess
|
||||||
|
|
||||||
digits = load_digits(2)
|
digits = load_digits(2)
|
||||||
@ -107,22 +122,20 @@ def test_classification_with_custom_objective():
|
|||||||
X = digits['data']
|
X = digits['data']
|
||||||
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
|
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
|
||||||
for train_index, test_index in kf:
|
for train_index, test_index in kf:
|
||||||
xgb_model = xgb.XGBClassifier(objective=logregobj).fit(
|
xgb_model = xgb.XGBClassifier(objective=logregobj)
|
||||||
X[train_index],y[train_index]
|
xgb_model.fit(X[train_index], y[train_index])
|
||||||
)
|
|
||||||
preds = xgb_model.predict(X[test_index])
|
preds = xgb_model.predict(X[test_index])
|
||||||
labels = y[test_index]
|
labels = y[test_index]
|
||||||
err = sum(1 for i in range(len(preds))
|
err = sum(1 for i in range(len(preds))
|
||||||
if int(preds[i]>0.5)!=labels[i]) / float(len(preds))
|
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
|
||||||
assert err < 0.1
|
assert err < 0.1
|
||||||
|
|
||||||
|
|
||||||
# Test that the custom objective function is actually used
|
# Test that the custom objective function is actually used
|
||||||
class XGBCustomObjectiveException(Exception):
|
class XGBCustomObjectiveException(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def dummy_objective(y_true, y_preds):
|
def dummy_objective(y_true, y_preds):
|
||||||
raise XGBCustomObjectiveException()
|
raise XGBCustomObjectiveException()
|
||||||
|
|
||||||
xgb_model = xgb.XGBClassifier(objective=dummy_objective)
|
xgb_model = xgb.XGBClassifier(objective=dummy_objective)
|
||||||
np.testing.assert_raises(
|
np.testing.assert_raises(
|
||||||
@ -131,6 +144,7 @@ def test_classification_with_custom_objective():
|
|||||||
X, y
|
X, y
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_sklearn_api():
|
def test_sklearn_api():
|
||||||
iris = load_iris()
|
iris = load_iris()
|
||||||
tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target, train_size=120)
|
tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target, train_size=120)
|
||||||
@ -143,6 +157,7 @@ def test_sklearn_api():
|
|||||||
err = sum([1 for p, l in zip(preds, labels) if p != l]) / len(te_l)
|
err = sum([1 for p, l in zip(preds, labels) if p != l]) / len(te_l)
|
||||||
assert err < 0.2
|
assert err < 0.2
|
||||||
|
|
||||||
|
|
||||||
def test_sklearn_plotting():
|
def test_sklearn_plotting():
|
||||||
iris = load_iris()
|
iris = load_iris()
|
||||||
|
|
||||||
@ -168,12 +183,13 @@ def test_sklearn_plotting():
|
|||||||
ax = xgb.plot_tree(classifier, num_trees=0)
|
ax = xgb.plot_tree(classifier, num_trees=0)
|
||||||
assert isinstance(ax, Axes)
|
assert isinstance(ax, Axes)
|
||||||
|
|
||||||
|
|
||||||
def test_sklearn_nfolds_cv():
|
def test_sklearn_nfolds_cv():
|
||||||
digits = load_digits(3)
|
digits = load_digits(3)
|
||||||
X = digits['data']
|
X = digits['data']
|
||||||
y = digits['target']
|
y = digits['target']
|
||||||
dm = xgb.DMatrix(X, label=y)
|
dm = xgb.DMatrix(X, label=y)
|
||||||
|
|
||||||
params = {
|
params = {
|
||||||
'max_depth': 2,
|
'max_depth': 2,
|
||||||
'eta': 1,
|
'eta': 1,
|
||||||
@ -187,9 +203,8 @@ def test_sklearn_nfolds_cv():
|
|||||||
nfolds = 5
|
nfolds = 5
|
||||||
skf = StratifiedKFold(y, n_folds=nfolds, shuffle=True, random_state=seed)
|
skf = StratifiedKFold(y, n_folds=nfolds, shuffle=True, random_state=seed)
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed)
|
cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed)
|
||||||
cv2 = xgb.cv(params, dm, num_boost_round=10, folds=skf, seed=seed)
|
cv2 = xgb.cv(params, dm, num_boost_round=10, folds=skf, seed=seed)
|
||||||
cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, stratified=True, seed=seed)
|
cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, stratified=True, seed=seed)
|
||||||
assert cv1.shape[0] == cv2.shape[0] and cv2.shape[0] == cv3.shape[0]
|
assert cv1.shape[0] == cv2.shape[0] and cv2.shape[0] == cv3.shape[0]
|
||||||
assert cv2.iloc[-1,0] == cv3.iloc[-1,0]
|
assert cv2.iloc[-1, 0] == cv3.iloc[-1, 0]
|
||||||
|
|||||||
@ -52,6 +52,9 @@ if [ ${TASK} == "python_lightweight_test" ]; then
|
|||||||
conda install numpy scipy nose
|
conda install numpy scipy nose
|
||||||
python -m pip install graphviz
|
python -m pip install graphviz
|
||||||
python -m nose tests/python/test_basic*.py || exit -1
|
python -m nose tests/python/test_basic*.py || exit -1
|
||||||
|
python -m pip install flake8
|
||||||
|
flake8 --ignore E501 python-package || exit -1
|
||||||
|
flake8 --ignore E501 tests/python || exit -1
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user