From bbe0dbd7ec781147327d5326700fbda8b70f2d61 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Sun, 21 Apr 2019 01:01:54 -0700 Subject: [PATCH] Migrate pylint check to Python 3 (#4381) * Migrate lint to Python 3 * Fix lint errors * Use Miniconda3 to use Python 3.7 * Use latest pylint and astroid --- Makefile | 4 +- python-package/.pylintrc | 2 +- python-package/xgboost/callback.py | 22 +++-- python-package/xgboost/compat.py | 11 ++- python-package/xgboost/core.py | 129 ++++++++++++++--------------- python-package/xgboost/libpath.py | 1 - python-package/xgboost/plotting.py | 4 +- python-package/xgboost/sklearn.py | 33 ++++---- python-package/xgboost/training.py | 10 +-- tests/travis/run_test.sh | 3 + tests/travis/setup.sh | 16 +++- 11 files changed, 118 insertions(+), 117 deletions(-) diff --git a/Makefile b/Makefile index ecaf15077..6c8477186 100644 --- a/Makefile +++ b/Makefile @@ -173,10 +173,10 @@ xgboost: $(CLI_OBJ) $(ALL_DEP) $(CXX) $(CFLAGS) -o $@ $(filter %.o %.a, $^) $(LDFLAGS) rcpplint: - python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} R-package/src + python3 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} R-package/src lint: rcpplint - python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} include src plugin python-package + python3 dmlc-core/scripts/lint.py --pylint-rc ${PWD}/python-package/.pylintrc xgboost ${LINT_LANG} include src plugin python-package pylint: flake8 --ignore E501 python-package diff --git a/python-package/.pylintrc b/python-package/.pylintrc index b224023db..8049662c7 100644 --- a/python-package/.pylintrc +++ b/python-package/.pylintrc @@ -4,7 +4,7 @@ ignore=tests extension-pkg-whitelist=numpy -disiable=unexpected-special-method-signature,too-many-nested-blocks +disable=unexpected-special-method-signature,too-many-nested-blocks,useless-object-inheritance dummy-variables-rgx=(unused|)_.* diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py index a4bff8113..2c6d3a704 100644 --- a/python-package/xgboost/callback.py +++ b/python-package/xgboost/callback.py @@ -1,5 +1,5 @@ # coding: utf-8 -# pylint: disable= invalid-name +# pylint: disable=invalid-name, too-many-statements """Training Library containing training routines.""" from __future__ import absolute_import @@ -20,13 +20,11 @@ def _fmt_metric(value, show_stdv=True): """format metric string""" if len(value) == 2: return '%s:%g' % (value[0], value[1]) - elif len(value) == 3: + if len(value) == 3: if show_stdv: return '%s:%g+%g' % (value[0], value[1], value[2]) - else: - return '%s:%g' % (value[0], value[1]) - else: - raise ValueError("wrong metric value") + return '%s:%g' % (value[0], value[1]) + raise ValueError("wrong metric value") def print_evaluation(period=1, show_stdv=True): @@ -50,10 +48,10 @@ def print_evaluation(period=1, show_stdv=True): """ def callback(env): """internal function""" - if env.rank != 0 or len(env.evaluation_result_list) == 0 or period is False or period == 0: + if env.rank != 0 or (not env.evaluation_result_list) or period is False or period == 0: return i = env.iteration - if (i % period == 0 or i + 1 == env.begin_iteration or i + 1 == env.end_iteration): + if i % period == 0 or i + 1 == env.begin_iteration or i + 1 == env.end_iteration: msg = '\t'.join([_fmt_metric(x, show_stdv) for x in env.evaluation_result_list]) rabit.tracker_print('[%d]\t%s\n' % (i, msg)) return callback @@ -89,7 +87,7 @@ def record_evaluation(eval_result): def callback(env): """internal function""" - if len(eval_result) == 0: + if not eval_result: init(env) for k, v in env.evaluation_result_list: pos = k.index('-') @@ -182,14 +180,14 @@ def early_stop(stopping_rounds, maximize=False, verbose=True): """internal function""" bst = env.model - if len(env.evaluation_result_list) == 0: + if not env.evaluation_result_list: raise ValueError('For early stopping you need at least one set in evals.') if len(env.evaluation_result_list) > 1 and verbose: msg = ("Multiple eval metrics have been passed: " "'{0}' will be used for early stopping.\n\n") rabit.tracker_print(msg.format(env.evaluation_result_list[-1][0])) maximize_metrics = ('auc', 'aucpr', 'map', 'ndcg') - maximize_at_n_metrics = ('auc@', 'aucpr@' 'map@', 'ndcg@') + maximize_at_n_metrics = ('auc@', 'aucpr@', 'map@', 'ndcg@') maximize_score = maximize metric_label = env.evaluation_result_list[-1][0] metric = metric_label.split('-', 1)[-1] @@ -225,7 +223,7 @@ def early_stop(stopping_rounds, maximize=False, verbose=True): def callback(env): """internal function""" score = env.evaluation_result_list[-1][1] - if len(state) == 0: + if not state: init(env) best_score = state['best_score'] best_iteration = state['best_iteration'] diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index 4b868d7bf..245f7fd66 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -11,14 +11,13 @@ PY3 = (sys.version_info[0] == 3) if PY3: # pylint: disable=invalid-name, redefined-builtin - STRING_TYPES = str, + STRING_TYPES = (str,) def py_str(x): """convert c string back to python string""" return x.decode('utf-8') else: - # pylint: disable=invalid-name - STRING_TYPES = basestring, + STRING_TYPES = (basestring,) # pylint: disable=undefined-variable def py_str(x): """convert c string back to python string""" @@ -37,13 +36,13 @@ try: PANDAS_INSTALLED = True except ImportError: + # pylint: disable=too-few-public-methods class MultiIndex(object): """ dummy for pandas.MultiIndex """ - pass + # pylint: disable=too-few-public-methods class DataFrame(object): """ dummy for pandas.DataFrame """ - pass PANDAS_INSTALLED = False @@ -57,9 +56,9 @@ try: DT_INSTALLED = True except ImportError: + # pylint: disable=too-few-public-methods class DataTable(object): """ dummy for datatable.DataTable """ - pass DT_INSTALLED = False diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 1cf3f473d..ff1474037 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1,6 +1,6 @@ # coding: utf-8 # pylint: disable=too-many-arguments, too-many-branches, invalid-name -# pylint: disable=too-many-branches, too-many-lines, W0141 +# pylint: disable=too-many-branches, too-many-lines, too-many-locals """Core XGBoost Library.""" from __future__ import absolute_import import collections @@ -30,7 +30,6 @@ c_bst_ulong = ctypes.c_uint64 class XGBoostError(Exception): """Error thrown by xgboost trainer.""" - pass class EarlyStopException(Exception): @@ -67,18 +66,16 @@ def from_pystr_to_cstr(data): list of str """ - if isinstance(data, list): - pointers = (ctypes.c_char_p * len(data))() - if PY3: - data = [bytes(d, 'utf-8') for d in data] - else: - data = [d.encode('utf-8') if isinstance(d, unicode) else d - for d in data] - pointers[:] = data - return pointers - else: - # copy from above when we actually use it + if not isinstance(data, list): raise NotImplementedError + pointers = (ctypes.c_char_p * len(data))() + if PY3: + data = [bytes(d, 'utf-8') for d in data] + else: + data = [d.encode('utf-8') if isinstance(d, unicode) else d # pylint: disable=undefined-variable + for d in data] + pointers[:] = data + return pointers def from_cstr_to_pystr(data, length): @@ -104,6 +101,7 @@ def from_cstr_to_pystr(data, length): try: res.append(str(data[i].decode('ascii'))) except UnicodeDecodeError: + # pylint: disable=undefined-variable res.append(unicode(data[i].decode('utf-8'))) return res @@ -123,7 +121,7 @@ def _get_log_callback_func(): def _load_lib(): """Load xgboost Library.""" lib_paths = find_lib_path() - if len(lib_paths) == 0: + if not lib_paths: return None try: pathBackup = os.environ['PATH'].split(os.pathsep) @@ -243,7 +241,7 @@ def _maybe_pandas_data(data, feature_names, feature_types): if feature_names is None: if isinstance(data.columns, MultiIndex): feature_names = [ - ' '.join(map(str, i)) + ' '.join([str(x) for x in i]) for i in data.columns ] else: @@ -267,8 +265,7 @@ def _maybe_pandas_label(label): label_dtypes = label.dtypes if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in label_dtypes): raise ValueError('DataFrame.dtypes for label must be int, float or bool') - else: - label = label.values.astype('float') + label = label.values.astype('float') # pd.Series can be passed to xgb as it is return label @@ -301,8 +298,7 @@ def _maybe_dt_data(data, feature_names, feature_types): # always return stypes for dt ingestion if feature_types is not None: raise ValueError('DataTable has own feature types, cannot pass them in') - else: - feature_types = np.vectorize(DT_TYPE_MAPPER2.get)(data_types_names) + feature_types = np.vectorize(DT_TYPE_MAPPER2.get)(data_types_names) return data, feature_names, feature_types @@ -512,7 +508,7 @@ class DMatrix(object): ptrs[icol] = ctypes.c_void_p(ptr) else: # datatable<=0.8.0 - from datatable.internal import frame_column_data_r + from datatable.internal import frame_column_data_r # pylint: disable=no-name-in-module,import-error for icol in range(data.ncols): ptrs[icol] = frame_column_data_r(data, icol) @@ -1039,8 +1035,7 @@ class Booster(object): self.handle, c_str(key), ctypes.byref(ret), ctypes.byref(success))) if success.value != 0: return py_str(ret.value) - else: - return None + return None def attributes(self): """Get attributes stored in the Booster as a dictionary. @@ -1056,8 +1051,7 @@ class Booster(object): ctypes.byref(length), ctypes.byref(sarr))) attr_names = from_cstr_to_pystr(sarr, length) - res = dict([(n, self.attr(n)) for n in attr_names]) - return res + return {n: self.attr(n) for n in attr_names} def set_attr(self, **kwargs): """Set the attribute of the Booster. @@ -1399,13 +1393,13 @@ class Booster(object): ret = self.get_dump(fmap, with_stats, dump_format) if dump_format == 'json': fout.write('[\n') - for i in range(len(ret)): + for i, _ in enumerate(ret): fout.write(ret[i]) if i < len(ret) - 1: fout.write(",\n") fout.write('\n]') else: - for i in range(len(ret)): + for i, _ in enumerate(ret): fout.write('booster[{}]:\n'.format(i)) fout.write(ret[i]) if need_close: @@ -1538,51 +1532,50 @@ class Booster(object): return fmap - else: - average_over_splits = True - if importance_type == 'total_gain': - importance_type = 'gain' - average_over_splits = False - elif importance_type == 'total_cover': - importance_type = 'cover' - average_over_splits = False + average_over_splits = True + if importance_type == 'total_gain': + importance_type = 'gain' + average_over_splits = False + elif importance_type == 'total_cover': + importance_type = 'cover' + average_over_splits = False - trees = self.get_dump(fmap, with_stats=True) + trees = self.get_dump(fmap, with_stats=True) - importance_type += '=' - fmap = {} - gmap = {} - for tree in trees: - for line in tree.split('\n'): - # look for the opening square bracket - arr = line.split('[') - # if no opening bracket (leaf node), ignore this line - if len(arr) == 1: - continue + importance_type += '=' + fmap = {} + gmap = {} + for tree in trees: + for line in tree.split('\n'): + # look for the opening square bracket + arr = line.split('[') + # if no opening bracket (leaf node), ignore this line + if len(arr) == 1: + continue - # look for the closing bracket, extract only info within that bracket - fid = arr[1].split(']') + # look for the closing bracket, extract only info within that bracket + fid = arr[1].split(']') - # extract gain or cover from string after closing bracket - g = float(fid[1].split(importance_type)[1].split(',')[0]) + # extract gain or cover from string after closing bracket + g = float(fid[1].split(importance_type)[1].split(',')[0]) - # extract feature name from string before closing bracket - fid = fid[0].split('<')[0] + # extract feature name from string before closing bracket + fid = fid[0].split('<')[0] - if fid not in fmap: - # if the feature hasn't been seen yet - fmap[fid] = 1 - gmap[fid] = g - else: - fmap[fid] += 1 - gmap[fid] += g + if fid not in fmap: + # if the feature hasn't been seen yet + fmap[fid] = 1 + gmap[fid] = g + else: + fmap[fid] += 1 + gmap[fid] += g - # calculate average value (gain/cover) for each feature - if average_over_splits: - for fid in gmap: - gmap[fid] = gmap[fid] / fmap[fid] + # calculate average value (gain/cover) for each feature + if average_over_splits: + for fid in gmap: + gmap[fid] = gmap[fid] / fmap[fid] - return gmap + return gmap def trees_to_dataframe(self, fmap=''): """Parse a boosted tree model text dump into a pandas DataFrame structure. @@ -1721,9 +1714,9 @@ class Booster(object): xgdump = self.get_dump(fmap=fmap) values = [] regexp = re.compile(r"\[{0}<([\d.Ee+-]+)\]".format(feature)) - for i in range(len(xgdump)): + for i, _ in enumerate(xgdump): m = re.findall(regexp, xgdump[i]) - values.extend(map(float, m)) + values.extend([float(x) for x in m]) n_unique = len(np.unique(values)) bins = max(min(n_unique, bins) if bins is not None else n_unique, 1) @@ -1734,9 +1727,7 @@ class Booster(object): if as_pandas and PANDAS_INSTALLED: return DataFrame(nph, columns=['SplitValue', 'Count']) - elif as_pandas and not PANDAS_INSTALLED: + if as_pandas and not PANDAS_INSTALLED: sys.stderr.write( "Returning histogram as ndarray (as_pandas == True, but pandas is not installed).") - return nph - else: - return nph + return nph diff --git a/python-package/xgboost/libpath.py b/python-package/xgboost/libpath.py index d87922c01..4d276050a 100644 --- a/python-package/xgboost/libpath.py +++ b/python-package/xgboost/libpath.py @@ -8,7 +8,6 @@ import sys class XGBoostLibraryNotFound(Exception): """Error thrown by when xgboost is not found""" - pass def find_lib_path(): diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py index 8e69dc10c..9a18d5587 100644 --- a/python-package/xgboost/plotting.py +++ b/python-package/xgboost/plotting.py @@ -55,7 +55,6 @@ def plot_importance(booster, ax=None, height=0.2, ------- ax : matplotlib Axes """ - # TODO: move this to compat.py try: import matplotlib.pyplot as plt except ImportError: @@ -70,11 +69,12 @@ def plot_importance(booster, ax=None, height=0.2, else: raise ValueError('tree must be Booster, XGBModel or dict instance') - if len(importance) == 0: + if not importance: raise ValueError('Booster.get_score() results in empty') tuples = [(k, importance[k]) for k in importance] if max_num_features is not None: + # pylint: disable=invalid-unary-operand-type tuples = sorted(tuples, key=lambda x: x[1])[-max_num_features:] else: tuples = sorted(tuples, key=lambda x: x[1]) diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 2f8dc5dbd..ae4751ea7 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -3,9 +3,9 @@ """Scikit-Learn Wrapper interface for XGBoost.""" from __future__ import absolute_import -import numpy as np import warnings import json +import numpy as np from .core import Booster, DMatrix, XGBoostError from .training import train @@ -107,15 +107,15 @@ class XGBModel(XGBModelBase): importance_type: string, default "gain" The feature importance type for the feature_importances_ property: either "gain", "weight", "cover", "total_gain" or "total_cover". - \*\*kwargs : dict, optional + \\*\\*kwargs : dict, optional Keyword arguments for XGBoost Booster object. Full documentation of parameters can be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst. - Attempting to set a parameter via the constructor args and \*\*kwargs dict simultaneously + Attempting to set a parameter via the constructor args and \\*\\*kwargs dict simultaneously will result in a TypeError. - .. note:: \*\*kwargs unsupported by scikit-learn + .. note:: \\*\\*kwargs unsupported by scikit-learn - \*\*kwargs is unsupported by scikit-learn. We do not guarantee that parameters + \\*\\*kwargs is unsupported by scikit-learn. We do not guarantee that parameters passed via this argument will interact properly with scikit-learn. Note @@ -597,7 +597,7 @@ class XGBModel(XGBModelBase): class XGBClassifier(XGBModel, XGBClassifierBase): - # pylint: disable=missing-docstring,too-many-arguments,invalid-name + # pylint: disable=missing-docstring,too-many-arguments,invalid-name,too-many-instance-attributes __doc__ = "Implementation of the scikit-learn API for XGBoost classification.\n\n" \ + '\n'.join(XGBModel.__doc__.split('\n')[2:]) @@ -834,10 +834,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase): validate_features=validate_features) if self.objective == "multi:softprob": return class_probs - else: - classone_probs = class_probs - classzero_probs = 1.0 - classone_probs - return np.vstack((classzero_probs, classone_probs)).transpose() + classone_probs = class_probs + classzero_probs = 1.0 - classone_probs + return np.vstack((classzero_probs, classone_probs)).transpose() def evals_result(self): """Return the evaluation results. @@ -1008,15 +1007,15 @@ class XGBRanker(XGBModel): missing : float, optional Value in the data which needs to be present as a missing value. If None, defaults to np.nan. - \*\*kwargs : dict, optional + \\*\\*kwargs : dict, optional Keyword arguments for XGBoost Booster object. Full documentation of parameters can be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst. - Attempting to set a parameter via the constructor args and \*\*kwargs dict + Attempting to set a parameter via the constructor args and \\*\\*kwargs dict simultaneously will result in a TypeError. - .. note:: \*\*kwargs unsupported by scikit-learn + .. note:: \\*\\*kwargs unsupported by scikit-learn - \*\*kwargs is unsupported by scikit-learn. We do not guarantee that parameters + \\*\\*kwargs is unsupported by scikit-learn. We do not guarantee that parameters passed via this argument will interact properly with scikit-learn. Note @@ -1073,7 +1072,7 @@ class XGBRanker(XGBModel): random_state=random_state, seed=seed, missing=missing, **kwargs) if callable(self.objective): raise ValueError("custom objective function not supported by XGBRanker") - elif "rank:" not in self.objective: + if "rank:" not in self.objective: raise ValueError("please use XGBRanker for ranking task") def fit(self, X, y, group, sample_weight=None, eval_set=None, sample_weight_eval_set=None, @@ -1158,9 +1157,9 @@ class XGBRanker(XGBModel): if eval_set is not None: if eval_group is None: raise ValueError("eval_group is required if eval_set is not None") - elif len(eval_group) != len(eval_set): + if len(eval_group) != len(eval_set): raise ValueError("length of eval_group should match that of eval_set") - elif any(group is None for group in eval_group): + if any(group is None for group in eval_group): raise ValueError("group is required for all eval datasets for ranking task") def _dmat_init(group, **params): diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index bca0988e6..71c96736a 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -49,7 +49,7 @@ def _train_internal(params, dtrain, # Distributed code: Load the checkpoint from rabit. version = bst.load_rabit_checkpoint() - assert(rabit.get_world_size() != 1 or version == 0) + assert rabit.get_world_size() != 1 or version == 0 rank = rabit.get_rank() start_iteration = int(version / 2) nboost += start_iteration @@ -75,12 +75,12 @@ def _train_internal(params, dtrain, bst.save_rabit_checkpoint() version += 1 - assert(rabit.get_world_size() == 1 or version == rabit.version_number()) + assert rabit.get_world_size() == 1 or version == rabit.version_number() nboost += 1 evaluation_result_list = [] # check evaluation result. - if len(evals) != 0: + if evals: bst_eval_set = bst.eval_set(evals, i, feval) if isinstance(bst_eval_set, STRING_TYPES): msg = bst_eval_set @@ -402,7 +402,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None else: params = dict((k, v) for k, v in params.items()) - if len(metrics) == 0 and 'eval_metric' in params: + if (not metrics) and 'eval_metric' in params: if isinstance(params['eval_metric'], list): metrics = params['eval_metric'] else: @@ -462,7 +462,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None rank=0, evaluation_result_list=res)) except EarlyStopException as e: - for k in results.keys(): + for k in results: results[k] = results[k][:(e.best_iteration + 1)] break if as_pandas: diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh index 325206bab..e29bd53b7 100755 --- a/tests/travis/run_test.sh +++ b/tests/travis/run_test.sh @@ -1,6 +1,9 @@ #!/bin/bash if [ ${TASK} == "lint" ]; then + source activate python3 + conda install numpy scipy + python -m pip install cpplint pylint astroid make lint || exit -1 echo "Check documentations..." diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh index 10e833060..751f6419d 100755 --- a/tests/travis/setup.sh +++ b/tests/travis/setup.sh @@ -1,7 +1,19 @@ #!/bin/bash if [ ${TASK} == "lint" ]; then - pip install --user cpplint 'pylint==1.4.4' 'astroid==1.3.6' + if [ ${TRAVIS_OS_NAME} == "osx" ]; then + wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + else + wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh + fi + bash conda.sh -b -p $HOME/miniconda + export PATH="$HOME/miniconda/bin:$PATH" + hash -r + conda config --set always_yes yes --set changeps1 no + conda update -q conda + # Useful for debugging any issues with conda + conda info -a + conda create -n python3 python=3.7 fi @@ -18,6 +30,6 @@ if [ ${TASK} == "python_test" ] || [ ${TASK} == "python_lightweight_test" ] || [ conda update -q conda # Useful for debugging any issues with conda conda info -a - conda create -n python3 python=3.5 + conda create -n python3 python=3.7 conda create -n python2 python=2.7 fi