diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py
index a45845b13..5e1dd69b5 100644
--- a/python-package/xgboost/callback.py
+++ b/python-package/xgboost/callback.py
@@ -134,14 +134,16 @@ def reset_learning_rate(learning_rates):
         if context == 'train':
             bst, i, n = env.model, env.iteration, env.end_iteration
-            bst.set_param('learning_rate', get_learning_rate(i, n, learning_rates))
+            bst.set_param(
+                'learning_rate', get_learning_rate(i, n, learning_rates))
         elif context == 'cv':
             i, n = env.iteration, env.end_iteration
             for cvpack in env.cvfolds:
                 bst = cvpack.bst
-                bst.set_param('learning_rate', get_learning_rate(i, n, learning_rates))
+                bst.set_param(
+                    'learning_rate', get_learning_rate(i, n, learning_rates))
 
-    callback.before_iteration = True
+    callback.before_iteration = False
     return callback
diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py
index f76a607d0..de3761968 100644
--- a/python-package/xgboost/training.py
+++ b/python-package/xgboost/training.py
@@ -4,7 +4,6 @@
 """Training Library containing training routines."""
 from __future__ import absolute_import
 
-import warnings
 import numpy as np
 from .core import Booster, STRING_TYPES, XGBoostError, CallbackEnv, EarlyStopException
 from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold)
@@ -54,9 +53,11 @@ def _train_internal(params, dtrain,
     nboost += start_iteration
 
     callbacks_before_iter = [
-        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
+        cb for cb in callbacks
+        if cb.__dict__.get('before_iteration', False)]
     callbacks_after_iter = [
-        cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
+        cb for cb in callbacks
+        if not cb.__dict__.get('before_iteration', False)]
 
     for i in range(start_iteration, num_boost_round):
         for cb in callbacks_before_iter:
@@ -113,7 +114,7 @@ def _train_internal(params, dtrain,
 
 def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
           maximize=False, early_stopping_rounds=None, evals_result=None,
-          verbose_eval=True, xgb_model=None, callbacks=None, learning_rates=None):
+          verbose_eval=True, xgb_model=None, callbacks=None):
     # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
     """Train a booster with given parameters.
 
@@ -169,11 +170,6 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
         / the boosting stage found by using **early_stopping_rounds** is also printed.
         Example: with ``verbose_eval=4`` and at least one item in **evals**, an evaluation metric
         is printed every 4 boosting stages, instead of every boosting stage.
-    learning_rates: list or function (deprecated - use callback API instead)
-        List of learning rate for each boosting round
-        or a customized function that calculates eta in terms of
-        current number of round and the total number of boosting round (e.g. yields
-        learning rate decay)
     xgb_model : file name of stored xgb model or 'Booster' instance
         Xgb model to be loaded before training (allows training continuation).
     callbacks : list of callback functions
@@ -206,11 +202,6 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
     if evals_result is not None:
         callbacks.append(callback.record_evaluation(evals_result))
 
-    if learning_rates is not None:
-        warnings.warn("learning_rates parameter is deprecated - use callback API instead",
-                      DeprecationWarning)
-        callbacks.append(callback.reset_learning_rate(learning_rates))
-
     return _train_internal(params, dtrain,
                            num_boost_round=num_boost_round,
                            evals=evals,
@@ -485,9 +476,11 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
         callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
 
     callbacks_before_iter = [
-        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
+        cb for cb in callbacks if
+        cb.__dict__.get('before_iteration', False)]
     callbacks_after_iter = [
-        cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
+        cb for cb in callbacks if
+        not cb.__dict__.get('before_iteration', False)]
 
     for i in range(num_boost_round):
         for cb in callbacks_before_iter:
diff --git a/tests/python-gpu/test_gpu_basic_models.py b/tests/python-gpu/test_gpu_basic_models.py
new file mode 100644
index 000000000..04a3eecaf
--- /dev/null
+++ b/tests/python-gpu/test_gpu_basic_models.py
@@ -0,0 +1,14 @@
+import sys
+import unittest
+import numpy as np
+sys.path.append("tests/python")
+# Don't import the test class, otherwise they will run twice.
+import test_basic_models as test_bm  # noqa
+rng = np.random.RandomState(1994)
+
+
+class TestGPUBasicModels(unittest.TestCase):
+    cputest = test_bm.TestModels()
+
+    def test_eta_decay_gpu_hist(self):
+        self.cputest.run_eta_decay('gpu_hist')
diff --git a/tests/python-gpu/test_gpu_interaction_constraints.py b/tests/python-gpu/test_gpu_interaction_constraints.py
index 0a135776d..a6d3ff2f8 100644
--- a/tests/python-gpu/test_gpu_interaction_constraints.py
+++ b/tests/python-gpu/test_gpu_interaction_constraints.py
@@ -3,7 +3,7 @@ import unittest
 import sys
 sys.path.append("tests/python")
 # Don't import the test class, otherwise they will run twice.
-import test_interaction_constraints as test_ic
+import test_interaction_constraints as test_ic  # noqa
 rng = np.random.RandomState(1994)
 
 
diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py
index 089b63d09..267078f22 100644
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -88,50 +88,82 @@ class TestModels(unittest.TestCase):
             assert np.sum(np.abs(preds_list[ii] - preds_list[jj])) > 0
         os.remove(model_path)
 
-    def test_eta_decay(self):
+    def run_eta_decay(self, tree_method):
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
         num_round = 4
 
         # learning_rates as a list
         # init eta with 0 to check whether learning_rates work
         param = {'max_depth': 2, 'eta': 0, 'verbosity': 0,
-                 'objective': 'binary:logistic'}
+                 'objective': 'binary:logistic', 'tree_method': tree_method}
         evals_result = {}
-        bst = xgb.train(param, dtrain, num_round, watchlist, learning_rates=[0.8, 0.7, 0.6, 0.5],
+        bst = xgb.train(param, dtrain, num_round, watchlist,
+                        callbacks=[xgb.callback.reset_learning_rate([
+                            0.8, 0.7, 0.6, 0.5
+                        ])],
                         evals_result=evals_result)
-        eval_errors = list(map(float, evals_result['eval']['error']))
+        eval_errors_0 = list(map(float, evals_result['eval']['error']))
         assert isinstance(bst, xgb.core.Booster)
         # validation error should decrease, if eta > 0
-        assert eval_errors[0] > eval_errors[-1]
+        assert eval_errors_0[0] > eval_errors_0[-1]
 
         # init learning_rate with 0 to check whether learning_rates work
         param = {'max_depth': 2, 'learning_rate': 0, 'verbosity': 0,
-                 'objective': 'binary:logistic'}
+                 'objective': 'binary:logistic', 'tree_method': tree_method}
         evals_result = {}
-        bst = xgb.train(param, dtrain, num_round, watchlist, learning_rates=[0.8, 0.7, 0.6, 0.5],
+        bst = xgb.train(param, dtrain, num_round, watchlist,
+                        callbacks=[xgb.callback.reset_learning_rate(
+                            [0.8, 0.7, 0.6, 0.5])],
                         evals_result=evals_result)
-        eval_errors = list(map(float, evals_result['eval']['error']))
+        eval_errors_1 = list(map(float, evals_result['eval']['error']))
         assert isinstance(bst, xgb.core.Booster)
         # validation error should decrease, if learning_rate > 0
-        assert eval_errors[0] > eval_errors[-1]
+        assert eval_errors_1[0] > eval_errors_1[-1]
 
         # check if learning_rates override default value of eta/learning_rate
-        param = {'max_depth': 2, 'verbosity': 0, 'objective': 'binary:logistic'}
+        param = {
+            'max_depth': 2, 'verbosity': 0, 'objective': 'binary:logistic',
+            'tree_method': tree_method
+        }
         evals_result = {}
-        bst = xgb.train(param, dtrain, num_round, watchlist, learning_rates=[0, 0, 0, 0],
+        bst = xgb.train(param, dtrain, num_round, watchlist,
+                        callbacks=[xgb.callback.reset_learning_rate(
+                            [0, 0, 0, 0]
+                        )],
                         evals_result=evals_result)
-        eval_errors = list(map(float, evals_result['eval']['error']))
+        eval_errors_2 = list(map(float, evals_result['eval']['error']))
         assert isinstance(bst, xgb.core.Booster)
         # validation error should not decrease, if eta/learning_rate = 0
-        assert eval_errors[0] == eval_errors[-1]
+        assert eval_errors_2[0] == eval_errors_2[-1]
 
         # learning_rates as a customized decay function
         def eta_decay(ithround, num_boost_round):
             return num_boost_round / (ithround + 1)
 
-        bst = xgb.train(param, dtrain, num_round, watchlist, learning_rates=eta_decay)
+        evals_result = {}
+        bst = xgb.train(param, dtrain, num_round, watchlist,
+                        callbacks=[
+                            xgb.callback.reset_learning_rate(eta_decay)
+                        ],
+                        evals_result=evals_result)
+        eval_errors_3 = list(map(float, evals_result['eval']['error']))
+        assert isinstance(bst, xgb.core.Booster)
+        assert eval_errors_3[0] == eval_errors_2[0]
+
+        for i in range(1, len(eval_errors_0)):
+            assert eval_errors_3[i] != eval_errors_2[i]
+
+    def test_eta_decay_hist(self):
+        self.run_eta_decay('hist')
+
+    def test_eta_decay_approx(self):
+        self.run_eta_decay('approx')
+
+    def test_eta_decay_exact(self):
+        self.run_eta_decay('exact')
+
     def test_boost_from_prediction(self):
         # Re-construct dtrain here to avoid modification
         margined = xgb.DMatrix(dpath + 'agaricus.txt.train')
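
For reference, a minimal sketch of the migration this patch implies: the removed learning_rates argument of xgb.train is replaced by passing a schedule through the callbacks argument with xgb.callback.reset_learning_rate, exactly as the updated tests do. The snippet assumes the callback API of the xgboost version targeted here (where reset_learning_rate exists), and the synthetic DMatrix is illustrative only; the tests themselves use the agaricus data files.

import numpy as np
import xgboost as xgb

# Illustrative data only; the real tests load the agaricus files.
rng = np.random.RandomState(1994)
X = rng.randn(200, 10)
y = rng.randint(0, 2, size=200)
dtrain = xgb.DMatrix(X, label=y)

param = {'max_depth': 2, 'objective': 'binary:logistic'}

# Old style (removed by this patch):
#   bst = xgb.train(param, dtrain, 4, learning_rates=[0.8, 0.7, 0.6, 0.5])

# New style: pass the per-round schedule through the callback API.
bst = xgb.train(param, dtrain, num_boost_round=4,
                callbacks=[xgb.callback.reset_learning_rate(
                    [0.8, 0.7, 0.6, 0.5])])

# A decay function also works; it receives the current round index and the
# total number of boosting rounds, and returns the learning rate to use.
def eta_decay(current_round, num_boost_round):
    return 0.5 / (current_round + 1)

bst = xgb.train(param, dtrain, num_boost_round=4,
                callbacks=[xgb.callback.reset_learning_rate(eta_decay)])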