diff --git a/jvm-packages/dev/build.sh b/jvm-packages/dev/build.sh
index d0571fbf1..db883908f 100755
--- a/jvm-packages/dev/build.sh
+++ b/jvm-packages/dev/build.sh
@@ -17,5 +17,5 @@
 rm /usr/bin/python
 ln -s /opt/rh/python27/root/usr/bin/python /usr/bin/python
 # build xgboost
-cd /xgboost/jvm-packages;mvn package
+cd /xgboost/jvm-packages;ulimit -c unlimited;mvn package
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
index 59b7d1b05..d5ecb3eca 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
@@ -80,7 +80,7 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
     val trainingRDD = sc.parallelize(Classification.train)
     val (booster, metrics) = XGBoost.trainDistributed(
       trainingRDD,
-      List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+      List("eta" -> "1", "max_depth" -> "6",
         "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers,
         "custom_eval" -> null, "custom_obj" -> null, "use_external_memory" -> false,
         "missing" -> Float.NaN).toMap,
@@ -92,7 +92,7 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
     val eval = new EvalError()
     val training = buildDataFrame(Classification.train)
     val testDM = new DMatrix(Classification.test.iterator)
-    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+    val paramMap = Map("eta" -> "1", "max_depth" -> "6",
       "objective" -> "binary:logistic",
       "num_round" -> 5, "num_workers" -> numWorkers, "use_external_memory" -> true)
     val model = new XGBoostClassifier(paramMap).fit(training)
@@ -104,54 +104,78 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
     val eval = new EvalError()
     val training = buildDataFrame(Classification.train)
     val testDM = new DMatrix(Classification.test.iterator)
-    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+    val paramMap = Map("eta" -> "1", "max_depth" -> "6",
       "objective" -> "binary:logistic",
       "num_round" -> 5, "num_workers" -> numWorkers,
       "tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala"))
     val model = new XGBoostClassifier(paramMap).fit(training)
     assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
   }
 
-  test("test with fast histo with monotone_constraints") {
+  test("test with quantile hist with monotone_constraints (lossguide)") {
     val eval = new EvalError()
     val training = buildDataFrame(Classification.train)
     val testDM = new DMatrix(Classification.test.iterator)
     val paramMap = Map("eta" -> "1",
-      "max_depth" -> "6", "silent" -> "1",
+      "max_depth" -> "6",
+      "objective" -> "binary:logistic",
       "tree_method" -> "hist", "grow_policy" -> "lossguide",
+      "num_round" -> 5, "num_workers" -> numWorkers, "monotone_constraints" -> "(1, 0)")
+    val model = new XGBoostClassifier(paramMap).fit(training)
+    assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
+  }
+
"interaction_constraints" -> "[[1,2],[2,3,4]]") + val model = new XGBoostClassifier(paramMap).fit(training) + assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1) + } + + test("test with quantile hist with monotone_constraints (depthwise)") { + val eval = new EvalError() + val training = buildDataFrame(Classification.train) + val testDM = new DMatrix(Classification.test.iterator) + val paramMap = Map("eta" -> "1", + "max_depth" -> "6", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise", "num_round" -> 5, "num_workers" -> numWorkers, "monotone_constraints" -> "(1, 0)") val model = new XGBoostClassifier(paramMap).fit(training) assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1) } - test("test with fast histo with interaction_constraints") { + test("test with quantile hist with interaction_constraints (depthwise)") { val eval = new EvalError() val training = buildDataFrame(Classification.train) val testDM = new DMatrix(Classification.test.iterator) val paramMap = Map("eta" -> "1", - "max_depth" -> "6", "silent" -> "1", + "max_depth" -> "6", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise", "num_round" -> 5, "num_workers" -> numWorkers, "interaction_constraints" -> "[[1,2],[2,3,4]]") val model = new XGBoostClassifier(paramMap).fit(training) assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1) } - test("test with fast histo depthwise") { + test("test with quantile hist depthwise") { val eval = new EvalError() val training = buildDataFrame(Classification.train) val testDM = new DMatrix(Classification.test.iterator) val paramMap = Map("eta" -> "1", - "max_depth" -> "6", "silent" -> "1", + "max_depth" -> "6", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise", "num_round" -> 5, "num_workers" -> numWorkers) val model = new XGBoostClassifier(paramMap).fit(training) assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1) } - test("test with fast histo lossguide") { + test("test with quantile hist lossguide") { val eval = new EvalError() val training = buildDataFrame(Classification.train) val testDM = new DMatrix(Classification.test.iterator) - val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "1", + val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "lossguide", "max_leaves" -> "8", "num_round" -> 5, "num_workers" -> numWorkers) @@ -160,11 +184,11 @@ class XGBoostGeneralSuite extends FunSuite with PerTest { assert(x < 0.1) } - test("test with fast histo lossguide with max bin") { + test("test with quantile hist lossguide with max bin") { val eval = new EvalError() val training = buildDataFrame(Classification.train) val testDM = new DMatrix(Classification.test.iterator) - val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0", + val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "lossguide", "max_leaves" -> "8", "max_bin" -> "16", "eval_metric" -> "error", "num_round" -> 5, "num_workers" -> numWorkers) @@ -173,11 +197,11 @@ class XGBoostGeneralSuite extends FunSuite with PerTest { assert(x < 0.1) } - test("test with fast histo depthwidth with max depth") { + test("test with quantile hist depthwidth with max depth") 
     val eval = new EvalError()
     val training = buildDataFrame(Classification.train)
     val testDM = new DMatrix(Classification.test.iterator)
-    val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "6", "silent" -> "0",
+    val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "6",
+      "objective" -> "binary:logistic",
       "tree_method" -> "hist", "grow_policy" -> "depthwise", "max_depth" -> "2",
       "eval_metric" -> "error",
       "num_round" -> 10, "num_workers" -> numWorkers)
@@ -186,11 +210,11 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
     assert(x < 0.1)
   }
 
-  test("test with fast histo depthwidth with max depth and max bin") {
+  test("test with quantile hist depthwidth with max depth and max bin") {
     val eval = new EvalError()
     val training = buildDataFrame(Classification.train)
     val testDM = new DMatrix(Classification.test.iterator)
-    val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "6", "silent" -> "0",
+    val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "6",
+      "objective" -> "binary:logistic",
       "tree_method" -> "hist", "grow_policy" -> "depthwise", "max_depth" -> "2",
       "max_bin" -> "2", "eval_metric" -> "error",
       "num_round" -> 10, "num_workers" -> numWorkers)
@@ -217,7 +241,7 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
     }
 
     val denseDF = buildDenseDataFrame().repartition(4)
-    val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
+    val paramMap = List("eta" -> "1", "max_depth" -> "2",
+      "objective" -> "binary:logistic",
       "missing" -> -0.1f, "num_workers" -> numWorkers).toMap
     val model = new XGBoostClassifier(paramMap).fit(denseDF)
     model.transform(denseDF).collect()
@@ -227,7 +251,7 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
     val eval = new EvalError()
     val training = buildDataFrame(Classification.train)
     val testDM = new DMatrix(Classification.test.iterator)
-    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+    val paramMap = Map("eta" -> "1", "max_depth" -> "6",
+      "objective" -> "binary:logistic",
       "timeout_request_workers" -> 0L,
       "num_round" -> 5, "num_workers" -> numWorkers)
     val model = new XGBoostClassifier(paramMap).fit(training)
@@ -241,7 +265,7 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
     val testDM = new DMatrix(Classification.test.iterator)
     val tmpPath = Files.createTempDirectory("model1").toAbsolutePath.toString
 
-    val paramMap = Map("eta" -> "1", "max_depth" -> 2, "silent" -> "1",
+    val paramMap = Map("eta" -> "1", "max_depth" -> 2,
+      "objective" -> "binary:logistic",
       "checkpoint_path" -> tmpPath, "checkpoint_interval" -> 2,
       "num_workers" -> numWorkers)
 
@@ -295,16 +319,17 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
     val trainingRDD = sc.parallelize(Ranking.train, 5)
     val (booster, _) = XGBoost.trainDistributed(
       trainingRDD,
-      List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+      List("eta" -> "1", "max_depth" -> "6",
         "objective" -> "rank:pairwise", "num_round" -> 5, "num_workers" -> numWorkers,
-        "missing" -> Float.NaN, "use_external_memory" -> false).toMap,
+        "custom_eval" -> null, "custom_obj" -> null, "use_external_memory" -> false,
+        "missing" -> Float.NaN).toMap,
       hasGroup = true)
 
     assert(booster != null)
   }
 
   test("training summary") {
-    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+    val paramMap = Map("eta" -> "1", "max_depth" -> "6",
+      "objective" -> "binary:logistic",
       "num_round" -> 5, "nWorkers" -> numWorkers)
 
     val trainingDF = buildDataFrame(Classification.train)
@@ -316,7 +341,7 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
   }
 
   test("train/test split") {
-    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+    val paramMap = Map("eta" -> "1", "max_depth" -> "6",
+      "objective" -> "binary:logistic",
       "train_test_ratio" -> "0.5",
       "num_round" -> 5, "num_workers" -> numWorkers)
     val training = buildDataFrame(Classification.train)
@@ -332,7 +357,7 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
   test("train with multiple validation datasets (non-ranking)") {
     val training = buildDataFrame(Classification.train)
     val Array(train, eval1, eval2) = training.randomSplit(Array(0.6, 0.2, 0.2))
-    val paramMap1 = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+    val paramMap1 = Map("eta" -> "1", "max_depth" -> "6",
+      "objective" -> "binary:logistic",
       "num_round" -> 5, "num_workers" -> numWorkers)
@@ -345,7 +370,7 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
     assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(0))
     assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(1))
 
-    val paramMap2 = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+    val paramMap2 = Map("eta" -> "1", "max_depth" -> "6",
+      "objective" -> "binary:logistic",
       "num_round" -> 5, "num_workers" -> numWorkers,
       "eval_sets" -> Map("eval1" -> eval1, "eval2" -> eval2))
@@ -362,7 +387,7 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
   test("train with multiple validation datasets (ranking)") {
     val training = buildDataFrameWithGroup(Ranking.train, 5)
     val Array(train, eval1, eval2) = training.randomSplit(Array(0.6, 0.2, 0.2))
-    val paramMap1 = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+    val paramMap1 = Map("eta" -> "1", "max_depth" -> "6",
+      "objective" -> "rank:pairwise",
       "num_round" -> 5, "num_workers" -> numWorkers, "group_col" -> "group")
     val xgb1 = new XGBoostRegressor(paramMap1).setEvalSets(Map("eval1" -> eval1, "eval2" -> eval2))
@@ -375,7 +400,7 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
     assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(0))
     assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(1))
 
-    val paramMap2 = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+    val paramMap2 = Map("eta" -> "1", "max_depth" -> "6",
+      "objective" -> "rank:pairwise",
       "num_round" -> 5, "num_workers" -> numWorkers, "group_col" -> "group",
       "eval_sets" -> Map("eval1" -> eval1, "eval2" -> eval2))
diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
index 41611edf6..f7b2ff8e3 100644
--- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
+++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
@@ -342,7 +342,6 @@ public class BoosterImplTest {
   public void testBoosterEarlyStop() throws XGBoostError, IOException {
     DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
     DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
-    // testBoosterWithFastHistogram(trainMat, testMat);
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
         put("max_depth", 3);
@@ -375,7 +374,7 @@ public class BoosterImplTest {
     }
   }
 
-  private void testWithFastHisto(DMatrix trainingSet, Map<String, DMatrix> watches, int round,
+  private void testWithQuantileHisto(DMatrix trainingSet, Map<String, DMatrix> watches, int round,
                                  Map<String, Object> paramMap, float threshold) throws XGBoostError {
     float[][] metrics = new float[watches.size()][round];
     Booster booster = XGBoost.train(trainingSet, paramMap, round, watches,
@@ -393,10 +392,9 @@ public class BoosterImplTest {
   }
 
   @Test
-  public void testFastHistoDepthWise() throws XGBoostError {
+  public void testQuantileHistoDepthWise() throws XGBoostError {
     DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
     DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
-    // testBoosterWithFastHistogram(trainMat, testMat);
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
         put("max_depth", 3);
@@ -410,14 +408,13 @@ public class BoosterImplTest {
     Map<String, DMatrix> watches = new HashMap<>();
     watches.put("training", trainMat);
     watches.put("test", testMat);
-    testWithFastHisto(trainMat, watches, 10, paramMap, 0.0f);
+    testWithQuantileHisto(trainMat, watches, 10, paramMap, 0.95f);
   }
 
   @Test
-  public void testFastHistoLossGuide() throws XGBoostError {
+  public void testQuantileHistoLossGuide() throws XGBoostError {
     DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
     DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
-    // testBoosterWithFastHistogram(trainMat, testMat);
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
         put("max_depth", 0);
@@ -432,14 +429,13 @@ public class BoosterImplTest {
     Map<String, DMatrix> watches = new HashMap<>();
     watches.put("training", trainMat);
     watches.put("test", testMat);
-    testWithFastHisto(trainMat, watches, 10, paramMap, 0.0f);
+    testWithQuantileHisto(trainMat, watches, 10, paramMap, 0.95f);
   }
 
   @Test
-  public void testFastHistoLossGuideMaxBin() throws XGBoostError {
+  public void testQuantileHistoLossGuideMaxBin() throws XGBoostError {
     DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
     DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
-    // testBoosterWithFastHistogram(trainMat, testMat);
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
         put("max_depth", 0);
@@ -454,7 +450,7 @@ public class BoosterImplTest {
     };
     Map<String, DMatrix> watches = new HashMap<>();
     watches.put("training", trainMat);
-    testWithFastHisto(trainMat, watches, 10, paramMap, 0.0f);
+    testWithQuantileHisto(trainMat, watches, 10, paramMap, 0.95f);
   }
 
   @Test
@@ -534,38 +530,33 @@ public class BoosterImplTest {
   }
 
   @Test
-  public void testFastHistoDepthwiseMaxDepth() throws XGBoostError {
+  public void testQuantileHistoDepthwiseMaxDepth() throws XGBoostError {
     DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
-    // testBoosterWithFastHistogram(trainMat, testMat);
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
         put("max_depth", 3);
         put("silent", 1);
         put("objective", "binary:logistic");
         put("tree_method", "hist");
-        put("max_depth", 2);
         put("grow_policy", "depthwise");
         put("eval_metric", "auc");
       }
     };
     Map<String, DMatrix> watches = new HashMap<>();
     watches.put("training", trainMat);
-    testWithFastHisto(trainMat, watches, 10, paramMap, 0.85f);
+    testWithQuantileHisto(trainMat, watches, 10, paramMap, 0.95f);
   }
 
   @Test
-  public void testFastHistoDepthwiseMaxDepthMaxBin() throws XGBoostError {
+  public void testQuantileHistoDepthwiseMaxDepthMaxBin() throws XGBoostError {
     DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
     DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
-    // testBoosterWithFastHistogram(trainMat, testMat);
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
         put("max_depth", 3);
         put("silent", 1);
         put("objective", "binary:logistic");
         put("tree_method", "hist");
-        put("max_depth", 2);
         put("max_bin", 2);
         put("grow_policy", "depthwise");
         put("eval_metric", "auc");
@@ -573,7 +564,7 @@ public class BoosterImplTest {
     };
     Map<String, DMatrix> watches = new HashMap<>();
     watches.put("training", trainMat);
-    testWithFastHisto(trainMat, watches, 10, paramMap, 0.85f);
+    testWithQuantileHisto(trainMat, watches, 10, paramMap, 0.95f);
   }
 
   /**
diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala
index 1791c4240..a53a5cd29 100644
--- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala
+++ b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala
@@ -77,7 +77,7 @@ class ScalaBoosterImplSuite extends FunSuite {
     XGBoost.train(trainMat, paramMap, round, watches)
   }
 
-  private def trainBoosterWithFastHisto(
+  private def trainBoosterWithQuantileHisto(
       trainMat: DMatrix,
       watches: Map[String, DMatrix],
       round: Int,
@@ -146,57 +146,57 @@ class ScalaBoosterImplSuite extends FunSuite {
     XGBoost.crossValidation(trainMat, params, round, nfold)
   }
 
-  test("test with fast histo depthwise") {
+  test("test with quantile histo depthwise") {
     val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
     val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
     val paramMap = List("max_depth" -> "3", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "depthwise", "eval_metric" -> "auc").toMap
-    trainBoosterWithFastHisto(trainMat, Map("training" -> trainMat, "test" -> testMat),
-      round = 10, paramMap, 0.0f)
+    trainBoosterWithQuantileHisto(trainMat, Map("training" -> trainMat, "test" -> testMat),
+      round = 10, paramMap, 0.95f)
   }
 
-  test("test with fast histo lossguide") {
+  test("test with quantile histo lossguide") {
     val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
     val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
     val paramMap = List("max_depth" -> "0", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "lossguide", "max_leaves" -> "8", "eval_metric" -> "auc").toMap
-    trainBoosterWithFastHisto(trainMat, Map("training" -> trainMat, "test" -> testMat),
-      round = 10, paramMap, 0.0f)
+    trainBoosterWithQuantileHisto(trainMat, Map("training" -> trainMat, "test" -> testMat),
+      round = 10, paramMap, 0.95f)
   }
 
-  test("test with fast histo lossguide with max bin") {
+  test("test with quantile histo lossguide with max bin") {
     val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
     val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
     val paramMap = List("max_depth" -> "0", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "lossguide", "max_leaves" -> "8", "max_bin" -> "16",
       "eval_metric" -> "auc").toMap
-    trainBoosterWithFastHisto(trainMat, Map("training" -> trainMat),
-      round = 10, paramMap, 0.0f)
+    trainBoosterWithQuantileHisto(trainMat, Map("training" -> trainMat),
+      round = 10, paramMap, 0.95f)
   }
 
-  test("test with fast histo depthwidth with max depth") {
+  test("test with quantile histo depthwidth with max depth") {
     val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
     val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
     val paramMap = List("max_depth" -> "0", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "depthwise", "max_leaves" -> "8", "max_depth" -> "2",
       "eval_metric" -> "auc").toMap
-    trainBoosterWithFastHisto(trainMat, Map("training" -> trainMat),
-      round = 10, paramMap, 0.85f)
+    trainBoosterWithQuantileHisto(trainMat, Map("training" -> trainMat),
+      round = 10, paramMap, 0.95f)
   }
 
-  test("test with fast histo depthwidth with max depth and max bin") {
+  test("test with quantile histo depthwidth with max depth and max bin") {
     val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
     val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
     val paramMap = List("max_depth" -> "0", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2",
       "eval_metric" -> "auc").toMap
-    trainBoosterWithFastHisto(trainMat, Map("training" -> trainMat),
-      round = 10, paramMap, 0.85f)
+    trainBoosterWithQuantileHisto(trainMat, Map("training" -> trainMat),
+      round = 10, paramMap, 0.95f)
   }
 
   test("test training from existing model in scala") {
diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index 0bde67f7d..422265a15 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -1,8 +1,6 @@
 /*!
- * Copyright 2017-2018 by Contributors
+ * Copyright 2017-2019 by Contributors
  * \file hist_util.h
- * \brief Utilities to store histograms
- * \author Philip Cho, Tianqi Chen
  */
 #include
 #include
@@ -161,6 +159,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) {
       SparsePage::Inst inst = batch[i];
       CHECK_EQ(ibegin + inst.size(), iend);
+
       for (bst_uint j = 0; j < inst.size(); ++j) {
         uint32_t idx = cut.GetBinIdx(inst[j]);
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index d96ca0a08..f479e85cc 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -73,8 +73,7 @@ void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair,
         std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone())));
   }
   for (auto tree : trees) {
-    builder_->Update
-        (gmat_, gmatb_, column_matrix_, gpair, dmat, tree);
+    builder_->Update(gmat_, gmatb_, column_matrix_, gpair, dmat, tree);
   }
   param_.learning_rate = lr;
 }
@@ -89,120 +88,275 @@ bool QuantileHistMaker::UpdatePredictionCache(
   }
 }
 
-void QuantileHistMaker::Builder::Update(const GHistIndexMatrix& gmat,
-                                        const GHistIndexBlockMatrix& gmatb,
-                                        const ColumnMatrix& column_matrix,
-                                        HostDeviceVector<GradientPair>* gpair,
-                                        DMatrix* p_fmat,
-                                        RegTree* p_tree) {
-  double gstart = dmlc::GetTime();
+void QuantileHistMaker::Builder::SyncHistograms(
+    int starting_index,
+    int sync_count,
+    RegTree *p_tree) {
+  perf_monitor.TickStart();
+  this->histred_.Allreduce(hist_[starting_index].data(),
+                           hist_builder_.GetNumBins() * sync_count);
+  // use Subtraction Trick
+  for (auto local_it = nodes_for_subtraction_trick_.begin();
+       local_it != nodes_for_subtraction_trick_.end(); local_it++) {
+    hist_.AddHistRow(local_it->first);
+    SubtractionTrick(hist_[local_it->first], hist_[local_it->second],
+                     hist_[(*p_tree)[local_it->first].Parent()]);
+  }
+  perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::BUILD_HIST);
+}
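
The key change in `SyncHistograms` is that the histograms of all nodes built at one depth level sit contiguously starting at `starting_index`, so a single `Allreduce` over `GetNumBins() * sync_count` entries replaces the per-node `Allreduce` the old `BuildHist` issued. A standalone sketch of that batching idea, not xgboost code: the two-worker buffers and `SimulatedAllreduceSum` below are made-up stand-ins for rabit's reducer.

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

struct GradPair { double grad = 0.0, hess = 0.0; };

// Stand-in for rabit::Reducer::Allreduce: element-wise sum across workers.
void SimulatedAllreduceSum(std::vector<std::vector<GradPair>>* worker_bufs) {
  for (size_t i = 0; i < (*worker_bufs)[0].size(); ++i) {
    GradPair sum;
    for (auto& buf : *worker_bufs) { sum.grad += buf[i].grad; sum.hess += buf[i].hess; }
    for (auto& buf : *worker_bufs) buf[i] = sum;
  }
}

int main() {
  const int nbins = 4;           // bins per node histogram
  const int starting_index = 1;  // first node id built at this depth
  const int sync_count = 2;      // nodes built locally at this depth
  // Per-worker flat storage: node nid occupies [nid * nbins, (nid + 1) * nbins).
  std::vector<std::vector<GradPair>> workers(2, std::vector<GradPair>(8 * nbins));
  workers[0][starting_index * nbins].grad = 1.0;  // toy local statistics
  workers[1][starting_index * nbins].grad = 2.0;

  // One collective call covers every node of the level at once.
  std::vector<std::vector<GradPair>> level(2);
  for (int w = 0; w < 2; ++w) {
    level[w].assign(workers[w].begin() + starting_index * nbins,
                    workers[w].begin() + (starting_index + sync_count) * nbins);
  }
  SimulatedAllreduceSum(&level);
  std::cout << level[0][0].grad << "\n";  // 3: both workers' statistics merged
}
```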
 
-  int num_leaves = 0;
-  unsigned timestamp = 0;
+void QuantileHistMaker::Builder::BuildLocalHistograms(
+    int *starting_index,
+    int *sync_count,
+    const GHistIndexMatrix &gmat,
+    const GHistIndexBlockMatrix &gmatb,
+    RegTree *p_tree,
+    const std::vector<GradientPair> &gpair_h) {
+  perf_monitor.TickStart();
+  for (size_t k = 0; k < qexpand_depth_wise_.size(); k++) {
+    int nid = qexpand_depth_wise_[k].nid;
+    RegTree::Node &node = (*p_tree)[nid];
+    if (rabit::IsDistributed()) {
+      if (node.IsRoot() || node.IsLeftChild()) {
+        // in distributed setting, we always calculate from left child or root node
+        hist_.AddHistRow(nid);
+        BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid], false);
+        if (!node.IsRoot()) {
+          nodes_for_subtraction_trick_[(*p_tree)[node.Parent()].RightChild()] = nid;
+        }
+        (*sync_count)++;
+        (*starting_index) = std::min((*starting_index), nid);
+      }
+    } else {
+      if (!node.IsRoot() && node.IsLeftChild() &&
+          (row_set_collection_[nid].Size() <
+           row_set_collection_[(*p_tree)[node.Parent()].RightChild()].Size())) {
+        hist_.AddHistRow(nid);
+        BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid], false);
+        nodes_for_subtraction_trick_[(*p_tree)[node.Parent()].RightChild()] = nid;
+        (*sync_count)++;
+        (*starting_index) = std::min((*starting_index), nid);
+      } else if (!node.IsRoot() && !node.IsLeftChild() &&
+                 (row_set_collection_[nid].Size() <=
+                  row_set_collection_[(*p_tree)[node.Parent()].LeftChild()].Size())) {
+        hist_.AddHistRow(nid);
+        BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid], false);
+        nodes_for_subtraction_trick_[(*p_tree)[node.Parent()].LeftChild()] = nid;
+        (*sync_count)++;
+        (*starting_index) = std::min((*starting_index), nid);
+      } else if (node.IsRoot()) {
+        // root node
+        hist_.AddHistRow(nid);
+        BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid], false);
+        (*sync_count)++;
+        (*starting_index) = std::min((*starting_index), nid);
+      }
+    }
+  }
+  perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::BUILD_HIST);
+}
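
`BuildLocalHistograms` scans rows for only one child of each split (in the single-node path, the one with fewer rows) and records its sibling in `nodes_for_subtraction_trick_`, so `SyncHistograms` can derive the sibling as hist(parent) - hist(built child). A self-contained sketch of that identity on toy two-bin histograms; the `GradPair`/`Hist` types here are simplified stand-ins for xgboost's histogram entries.

```cpp
#include <iostream>
#include <vector>

struct GradPair { double grad = 0.0, hess = 0.0; };
using Hist = std::vector<GradPair>;

// hist(sibling) = hist(parent) - hist(built child), bin by bin.
Hist SubtractionTrick(const Hist& parent, const Hist& child) {
  Hist sibling(parent.size());
  for (size_t i = 0; i < parent.size(); ++i) {
    sibling[i].grad = parent[i].grad - child[i].grad;
    sibling[i].hess = parent[i].hess - child[i].hess;
  }
  return sibling;
}

int main() {
  Hist parent = {{10, 5}, {6, 3}};
  Hist left   = {{4, 2},  {1, 1}};              // built by scanning the smaller child
  Hist right  = SubtractionTrick(parent, left); // sibling costs no row scan at all
  std::cout << right[0].grad << " " << right[1].hess << "\n";  // 6 2
}
```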
 
-  double tstart;
-  double time_init_data = 0;
-  double time_init_new_node = 0;
-  double time_build_hist = 0;
-  double time_evaluate_split = 0;
-  double time_apply_split = 0;
-
-  const std::vector<GradientPair>& gpair_h = gpair->ConstHostVector();
-
-  spliteval_->Reset();
-
-  tstart = dmlc::GetTime();
-  this->InitData(gmat, gpair_h, *p_fmat, *p_tree);
-  time_init_data = dmlc::GetTime() - tstart;
-
-  // FIXME(hcho3): this code is broken when param.num_roots > 1. Please fix it
-  CHECK_EQ(p_tree->param.num_roots, 1)
-    << "tree_method=hist does not support multiple roots at this moment";
-  for (int nid = 0; nid < p_tree->param.num_roots; ++nid) {
-    tstart = dmlc::GetTime();
-    hist_.AddHistRow(nid);
-    BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid]);
-    time_build_hist += dmlc::GetTime() - tstart;
-
-    tstart = dmlc::GetTime();
+void QuantileHistMaker::Builder::BuildNodeStats(
+    const GHistIndexMatrix &gmat,
+    DMatrix *p_fmat,
+    RegTree *p_tree,
+    const std::vector<GradientPair> &gpair_h) {
+  perf_monitor.TickStart();
+  for (size_t k = 0; k < qexpand_depth_wise_.size(); k++) {
+    int nid = qexpand_depth_wise_[k].nid;
     this->InitNewNode(nid, gmat, gpair_h, *p_fmat, *p_tree);
-    time_init_new_node += dmlc::GetTime() - tstart;
+    // add constraints
+    if (!(*p_tree)[nid].IsLeftChild() && !(*p_tree)[nid].IsRoot()) {
+      // it's a right child
+      auto parent_id = (*p_tree)[nid].Parent();
+      auto left_sibling_id = (*p_tree)[parent_id].LeftChild();
+      auto parent_split_feature_id = snode_[parent_id].best.SplitIndex();
+      spliteval_->AddSplit(parent_id, left_sibling_id, nid, parent_split_feature_id,
+                           snode_[left_sibling_id].weight, snode_[nid].weight);
+    }
+  }
+  perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::INIT_NEW_NODE);
+}
 
-    tstart = dmlc::GetTime();
+void QuantileHistMaker::Builder::EvaluateSplits(
+    const GHistIndexMatrix &gmat,
+    const ColumnMatrix &column_matrix,
+    DMatrix *p_fmat,
+    RegTree *p_tree,
+    int *num_leaves,
+    int depth,
+    unsigned *timestamp,
+    std::vector<ExpandEntry> *temp_qexpand_depth) {
+  for (size_t k = 0; k < qexpand_depth_wise_.size(); k++) {
+    int nid = qexpand_depth_wise_[k].nid;
+    perf_monitor.TickStart();
     this->EvaluateSplit(nid, gmat, hist_, *p_fmat, *p_tree);
-    time_evaluate_split += dmlc::GetTime() - tstart;
-    qexpand_->push(ExpandEntry(nid, p_tree->GetDepth(nid),
+    perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::EVALUATE_SPLIT);
+    if (snode_[nid].best.loss_chg < kRtEps ||
+        (param_.max_depth > 0 && depth == param_.max_depth) ||
+        (param_.max_leaves > 0 && (*num_leaves) == param_.max_leaves)) {
+      (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
+    } else {
+      perf_monitor.TickStart();
+      this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree);
+      perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::APPLY_SPLIT);
+      int left_id = (*p_tree)[nid].LeftChild();
+      int right_id = (*p_tree)[nid].RightChild();
+      temp_qexpand_depth->push_back(ExpandEntry(left_id,
+          p_tree->GetDepth(left_id), 0.0, (*timestamp)++));
+      temp_qexpand_depth->push_back(ExpandEntry(right_id,
+          p_tree->GetDepth(right_id), 0.0, (*timestamp)++));
+      // - 1 parent + 2 new children
+      (*num_leaves)++;
+    }
+  }
+}
+
+void QuantileHistMaker::Builder::ExpandWithDepthWidth(
+    const GHistIndexMatrix &gmat,
+    const GHistIndexBlockMatrix &gmatb,
+    const ColumnMatrix &column_matrix,
+    DMatrix *p_fmat,
+    RegTree *p_tree,
+    const std::vector<GradientPair> &gpair_h) {
+  unsigned timestamp = 0;
+  int num_leaves = 0;
+
+  // in depth_wise growing, we feed loss_chg with 0.0 since it is not used anyway
+  qexpand_depth_wise_.push_back(ExpandEntry(0, p_tree->GetDepth(0), 0.0, timestamp++));
+  ++num_leaves;
+  for (int depth = 0; depth < param_.max_depth + 1; depth++) {
+    int starting_index = std::numeric_limits<int>::max();
+    int sync_count = 0;
+    std::vector<ExpandEntry> temp_qexpand_depth;
+    BuildLocalHistograms(&starting_index, &sync_count, gmat, gmatb, p_tree, gpair_h);
+    SyncHistograms(starting_index, sync_count, p_tree);
+    BuildNodeStats(gmat, p_fmat, p_tree, gpair_h);
+    EvaluateSplits(gmat, column_matrix, p_fmat, p_tree, &num_leaves, depth, &timestamp,
+                   &temp_qexpand_depth);
+    // clean up
+    qexpand_depth_wise_.clear();
+    nodes_for_subtraction_trick_.clear();
+    if (temp_qexpand_depth.empty()) {
+      break;
+    } else {
+      qexpand_depth_wise_ = temp_qexpand_depth;
+      temp_qexpand_depth.clear();
+    }
+  }
+}
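
`ExpandWithDepthWidth` drives growth one level at a time: the frontier is a plain vector rebuilt each level, and the build/sync/stats/evaluate phases each run once over the whole level. A stripped-down sketch of just the frontier bookkeeping; the node ids and the leaf test are invented for illustration.

```cpp
#include <iostream>
#include <vector>

int main() {
  const int max_depth = 3;
  std::vector<int> frontier = {0};            // qexpand_depth_wise_ analogue
  int next_id = 1;
  for (int depth = 0; depth < max_depth + 1; ++depth) {
    std::vector<int> next_frontier;           // temp_qexpand_depth analogue
    for (int nid : frontier) {
      bool make_leaf = (depth == max_depth);  // stand-in for the loss_chg test
      if (!make_leaf) {
        next_frontier.push_back(next_id++);   // left child
        next_frontier.push_back(next_id++);   // right child
      }
    }
    std::cout << "depth " << depth << ": expanded " << frontier.size() << " nodes\n";
    frontier.swap(next_frontier);  // build + single sync would happen once per level here
    if (frontier.empty()) break;
  }
}
```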
 
+void QuantileHistMaker::Builder::ExpandWithLossGuide(
+    const GHistIndexMatrix& gmat,
+    const GHistIndexBlockMatrix& gmatb,
+    const ColumnMatrix& column_matrix,
+    DMatrix* p_fmat,
+    RegTree* p_tree,
+    const std::vector<GradientPair>& gpair_h) {
+  unsigned timestamp = 0;
+  int num_leaves = 0;
+
+  for (int nid = 0; nid < p_tree->param.num_roots; ++nid) {
+    perf_monitor.TickStart();
+    hist_.AddHistRow(nid);
+    BuildHist(gpair_h, row_set_collection_[nid], gmat, gmatb, hist_[nid], true);
+    perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::BUILD_HIST);
+
+    perf_monitor.TickStart();
+    this->InitNewNode(nid, gmat, gpair_h, *p_fmat, *p_tree);
+    perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::INIT_NEW_NODE);
+
+    perf_monitor.TickStart();
+    this->EvaluateSplit(nid, gmat, hist_, *p_fmat, *p_tree);
+    perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::EVALUATE_SPLIT);
+    qexpand_loss_guided_->push(ExpandEntry(nid, p_tree->GetDepth(nid),
                                snode_[nid].best.loss_chg,
                                timestamp++));
     ++num_leaves;
   }
 
-  while (!qexpand_->empty()) {
-    const ExpandEntry candidate = qexpand_->top();
+  while (!qexpand_loss_guided_->empty()) {
+    const ExpandEntry candidate = qexpand_loss_guided_->top();
     const int nid = candidate.nid;
-    qexpand_->pop();
+    qexpand_loss_guided_->pop();
     if (candidate.loss_chg <= kRtEps
         || (param_.max_depth > 0 && candidate.depth == param_.max_depth)
         || (param_.max_leaves > 0 && num_leaves == param_.max_leaves) ) {
       (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
     } else {
-      tstart = dmlc::GetTime();
+      perf_monitor.TickStart();
       this->ApplySplit(nid, gmat, column_matrix, hist_, *p_fmat, p_tree);
-      time_apply_split += dmlc::GetTime() - tstart;
+      perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::APPLY_SPLIT);
 
-      tstart = dmlc::GetTime();
+      perf_monitor.TickStart();
       const int cleft = (*p_tree)[nid].LeftChild();
       const int cright = (*p_tree)[nid].RightChild();
       hist_.AddHistRow(cleft);
       hist_.AddHistRow(cright);
       if (rabit::IsDistributed()) {
         // in distributed mode, we need to keep consistent across workers
-        BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, hist_[cleft]);
+        BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, hist_[cleft], true);
         SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
       } else {
         if (row_set_collection_[cleft].Size() < row_set_collection_[cright].Size()) {
-          BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, hist_[cleft]);
+          BuildHist(gpair_h, row_set_collection_[cleft], gmat, gmatb, hist_[cleft], true);
           SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
         } else {
-          BuildHist(gpair_h, row_set_collection_[cright], gmat, gmatb, hist_[cright]);
+          BuildHist(gpair_h, row_set_collection_[cright], gmat, gmatb, hist_[cright], true);
           SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
         }
       }
-      time_build_hist += dmlc::GetTime() - tstart;
+      perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::BUILD_HIST);
 
-      tstart = dmlc::GetTime();
+      perf_monitor.TickStart();
       this->InitNewNode(cleft, gmat, gpair_h, *p_fmat, *p_tree);
       this->InitNewNode(cright, gmat, gpair_h, *p_fmat, *p_tree);
       bst_uint featureid = snode_[nid].best.SplitIndex();
       spliteval_->AddSplit(nid, cleft, cright, featureid,
                            snode_[cleft].weight, snode_[cright].weight);
-      time_init_new_node += dmlc::GetTime() - tstart;
+      perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::INIT_NEW_NODE);
 
-      tstart = dmlc::GetTime();
+      perf_monitor.TickStart();
       this->EvaluateSplit(cleft, gmat, hist_, *p_fmat, *p_tree);
       this->EvaluateSplit(cright, gmat, hist_, *p_fmat, *p_tree);
-      time_evaluate_split += dmlc::GetTime() - tstart;
+      perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::EVALUATE_SPLIT);
 
-      qexpand_->push(ExpandEntry(cleft, p_tree->GetDepth(cleft),
+      qexpand_loss_guided_->push(ExpandEntry(cleft, p_tree->GetDepth(cleft),
                      snode_[cleft].best.loss_chg, timestamp++));
-      qexpand_->push(ExpandEntry(cright, p_tree->GetDepth(cright),
+      qexpand_loss_guided_->push(ExpandEntry(cright, p_tree->GetDepth(cright),
                      snode_[cright].best.loss_chg, timestamp++));
 
       ++num_leaves;  // give two and take one, as parent is no longer a leaf
     }
   }
+}
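
Loss-guided growth keeps its frontier in `qexpand_loss_guided_`, a `std::priority_queue` ordered by the `LossGuide` comparator declared in the header diff further down. That hunk is truncated after the tie-break branch, so the second branch in this standalone sketch (preferring the larger `loss_chg`) is an assumption inferred from how the queue is consumed.

```cpp
#include <functional>
#include <iostream>
#include <queue>
#include <vector>

struct ExpandEntry { int nid; int depth; float loss_chg; unsigned timestamp; };

// Queue surfaces the candidate with the largest loss_chg; ties go to the
// smaller (older) timestamp, matching the comparator's comment.
bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) {
  if (lhs.loss_chg == rhs.loss_chg) {
    return lhs.timestamp > rhs.timestamp;  // favor small timestamp
  }
  return lhs.loss_chg < rhs.loss_chg;      // favor large loss reduction (assumed)
}

int main() {
  std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
                      std::function<bool(ExpandEntry, ExpandEntry)>> q(LossGuide);
  q.push({1, 1, 0.5f, 0});
  q.push({2, 1, 0.9f, 1});
  q.push({3, 1, 0.9f, 2});
  std::cout << q.top().nid << "\n";  // 2: highest gain, then earliest inserted
}
```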
 
-  // set all the rest expanding nodes to leaf
-  // This post condition is not needed in current code, but may be necessary
-  // when there are stopping rule that leaves qexpand non-empty
-  while (!qexpand_->empty()) {
-    const int nid = qexpand_->top().nid;
-    qexpand_->pop();
-    (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
+void QuantileHistMaker::Builder::Update(const GHistIndexMatrix& gmat,
+                                        const GHistIndexBlockMatrix& gmatb,
+                                        const ColumnMatrix& column_matrix,
+                                        HostDeviceVector<GradientPair>* gpair,
+                                        DMatrix* p_fmat,
+                                        RegTree* p_tree) {
+  perf_monitor.StartPerfMonitor();
+
+  const std::vector<GradientPair>& gpair_h = gpair->ConstHostVector();
+
+  spliteval_->Reset();
+
+  perf_monitor.TickStart();
+  this->InitData(gmat, gpair_h, *p_fmat, *p_tree);
+  perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::INIT_DATA);
+
+  if (param_.grow_policy == TrainParam::kLossGuide) {
+    ExpandWithLossGuide(gmat, gmatb, column_matrix, p_fmat, p_tree, gpair_h);
+  } else {
+    ExpandWithDepthWidth(gmat, gmatb, column_matrix, p_fmat, p_tree, gpair_h);
   }
-  // remember auxiliary statistics in the tree node
+
   for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
     p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg;
     p_tree->Stat(nid).base_weight = snode_[nid].weight;
@@ -211,30 +365,7 @@ void QuantileHistMaker::Builder::Update(const GHistIndexMatrix& gmat,
   pruner_->Update(gpair, p_fmat, std::vector<RegTree*>{p_tree});
 
-  double total_time = dmlc::GetTime() - gstart;
-  LOG(INFO) << "\nInitData: "
-            << std::fixed << std::setw(6) << std::setprecision(4) << time_init_data
-            << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-            << time_init_data / total_time * 100 << "%)\n"
-            << "InitNewNode: "
-            << std::fixed << std::setw(6) << std::setprecision(4) << time_init_new_node
-            << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-            << time_init_new_node / total_time * 100 << "%)\n"
-            << "BuildHist: "
-            << std::fixed << std::setw(6) << std::setprecision(4) << time_build_hist
-            << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-            << time_build_hist / total_time * 100 << "%)\n"
-            << "EvaluateSplit: "
-            << std::fixed << std::setw(6) << std::setprecision(4) << time_evaluate_split
-            << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-            << time_evaluate_split / total_time * 100 << "%)\n"
-            << "ApplySplit: "
-            << std::fixed << std::setw(6) << std::setprecision(4) << time_apply_split
-            << " (" << std::fixed << std::setw(5) << std::setprecision(2)
-            << time_apply_split / total_time * 100 << "%)\n"
"========================================\n" - << "Total: " - << std::fixed << std::setw(6) << std::setprecision(4) << total_time; + perf_monitor.EndPerfMonitor(); } bool QuantileHistMaker::Builder::UpdatePredictionCache( @@ -353,14 +484,13 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat, p_last_tree_ = &tree; // store a pointer to training data p_last_fmat_ = &fmat; - // initialize feature index - if (data_layout_ == kDenseDataOneBased) { - column_sampler_.Init(info.num_col_, param_.colsample_bynode, - param_.colsample_bylevel, param_.colsample_bytree, true); - } else { - column_sampler_.Init(info.num_col_, param_.colsample_bynode, - param_.colsample_bylevel, param_.colsample_bytree, false); - } + } + if (data_layout_ == kDenseDataOneBased) { + column_sampler_.Init(info.num_col_, param_.colsample_bynode, param_.colsample_bylevel, + param_.colsample_bytree, true); + } else { + column_sampler_.Init(info.num_col_, param_.colsample_bynode, param_.colsample_bylevel, + param_.colsample_bytree, false); } if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) { /* specialized code for dense data: @@ -387,9 +517,9 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat, } { if (param_.grow_policy == TrainParam::kLossGuide) { - qexpand_.reset(new ExpandQueue(LossGuide)); + qexpand_loss_guided_.reset(new ExpandQueue(LossGuide)); } else { - qexpand_.reset(new ExpandQueue(DepthWise)); + qexpand_depth_wise_.clear(); } } } diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 8e68aea44..ba31dc55a 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -7,6 +7,7 @@ #ifndef XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_ #define XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_ +#include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #include #include "./param.h" @@ -97,13 +99,16 @@ class QuantileHistMaker: public TreeUpdater { const RowSetCollection::Elem row_indices, const GHistIndexMatrix& gmat, const GHistIndexBlockMatrix& gmatb, - GHistRow hist) { + GHistRow hist, + bool sync_hist) { if (param_.enable_feature_grouping > 0) { hist_builder_.BuildBlockHist(gpair, row_indices, gmatb, hist); } else { hist_builder_.BuildHist(gpair, row_indices, gmat, hist); } - this->histred_.Allreduce(hist.data(), hist_builder_.GetNumBins()); + if (sync_hist) { + this->histred_.Allreduce(hist.data(), hist_builder_.GetNumBins()); + } } inline void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) { @@ -114,6 +119,94 @@ class QuantileHistMaker: public TreeUpdater { HostDeviceVector* p_out_preds); protected: + /* tree growing policies */ + struct ExpandEntry { + int nid; + int depth; + bst_float loss_chg; + unsigned timestamp; + ExpandEntry(int nid, int depth, bst_float loss_chg, unsigned tstmp) + : nid(nid), depth(depth), loss_chg(loss_chg), timestamp(tstmp) {} + }; + + struct TreeGrowingPerfMonitor { + enum timer_name {INIT_DATA, INIT_NEW_NODE, BUILD_HIST, EVALUATE_SPLIT, APPLY_SPLIT}; + + double global_start; + + // performance counters + double tstart; + double time_init_data = 0; + double time_init_new_node = 0; + double time_build_hist = 0; + double time_evaluate_split = 0; + double time_apply_split = 0; + + inline void StartPerfMonitor() { + global_start = dmlc::GetTime(); + } + + inline void EndPerfMonitor() { + CHECK_GT(global_start, 0); + double total_time = dmlc::GetTime() - global_start; + LOG(INFO) << "\nInitData: " + << std::fixed << std::setw(6) << 
+      LOG(INFO) << "\nInitData: "
+                << std::fixed << std::setw(6) << std::setprecision(4) << time_init_data
+                << " (" << std::fixed << std::setw(5) << std::setprecision(2)
+                << time_init_data / total_time * 100 << "%)\n"
+                << "InitNewNode: "
+                << std::fixed << std::setw(6) << std::setprecision(4) << time_init_new_node
+                << " (" << std::fixed << std::setw(5) << std::setprecision(2)
+                << time_init_new_node / total_time * 100 << "%)\n"
+                << "BuildHist: "
+                << std::fixed << std::setw(6) << std::setprecision(4) << time_build_hist
+                << " (" << std::fixed << std::setw(5) << std::setprecision(2)
+                << time_build_hist / total_time * 100 << "%)\n"
+                << "EvaluateSplit: "
+                << std::fixed << std::setw(6) << std::setprecision(4) << time_evaluate_split
+                << " (" << std::fixed << std::setw(5) << std::setprecision(2)
+                << time_evaluate_split / total_time * 100 << "%)\n"
+                << "ApplySplit: "
+                << std::fixed << std::setw(6) << std::setprecision(4) << time_apply_split
+                << " (" << std::fixed << std::setw(5) << std::setprecision(2)
+                << time_apply_split / total_time * 100 << "%)\n"
+                << "========================================\n"
+                << "Total: "
+                << std::fixed << std::setw(6) << std::setprecision(4) << total_time;
+      // clear performance counters
+      time_init_data = 0;
+      time_init_new_node = 0;
+      time_build_hist = 0;
+      time_evaluate_split = 0;
+      time_apply_split = 0;
+    }
+
+    inline void TickStart() {
+      tstart = dmlc::GetTime();
+    }
+
+    inline void UpdatePerfTimer(const timer_name &timer_name) {
+      CHECK_GT(tstart, 0);
+      switch (timer_name) {
+        case INIT_DATA:
+          time_init_data += dmlc::GetTime() - tstart;
+          break;
+        case INIT_NEW_NODE:
+          time_init_new_node += dmlc::GetTime() - tstart;
+          break;
+        case BUILD_HIST:
+          time_build_hist += dmlc::GetTime() - tstart;
+          break;
+        case EVALUATE_SPLIT:
+          time_evaluate_split += dmlc::GetTime() - tstart;
+          break;
+        case APPLY_SPLIT:
+          time_apply_split += dmlc::GetTime() - tstart;
+          break;
+      }
+      tstart = -1;
+    }
+  };
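
`TreeGrowingPerfMonitor` replaces the ad-hoc `tstart`/`time_*` locals of the old `Update` with a reusable accumulate-by-phase pattern: `TickStart` stamps a start time and `UpdatePerfTimer` adds the elapsed interval to the named counter. A compilable miniature of the same pattern follows; `std::chrono` stands in for `dmlc::GetTime`, and the phase list is cut down to two made-up phases.

```cpp
#include <chrono>
#include <iostream>

static double Now() {
  using namespace std::chrono;
  return duration<double>(steady_clock::now().time_since_epoch()).count();
}

struct PerfMonitor {
  enum Phase { BUILD_HIST, APPLY_SPLIT };
  double tstart = -1;
  double time_build_hist = 0, time_apply_split = 0;

  void TickStart() { tstart = Now(); }
  void UpdatePerfTimer(Phase p) {
    double elapsed = Now() - tstart;
    (p == BUILD_HIST ? time_build_hist : time_apply_split) += elapsed;
    tstart = -1;  // mark the interval consumed (the real struct CHECK_GTs this)
  }
};

int main() {
  PerfMonitor mon;
  mon.TickStart();
  volatile double sink = 0;  // some work to time
  for (int i = 0; i < 1000000; ++i) sink += i;
  mon.UpdatePerfTimer(PerfMonitor::BUILD_HIST);
  std::cout << "BuildHist: " << mon.time_build_hist << "s\n";
}
```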
+
   // initialize temp data structure
   void InitData(const GHistIndexMatrix& gmat,
                 const std::vector<GradientPair>& gpair,
@@ -165,22 +258,45 @@ class QuantileHistMaker: public TreeUpdater {
                               bst_uint fid,
                               bst_uint nodeID);
 
-  /* tree growing policies */
-  struct ExpandEntry {
-    int nid;
-    int depth;
-    bst_float loss_chg;
-    unsigned timestamp;
-    ExpandEntry(int nid, int depth, bst_float loss_chg, unsigned tstmp)
-        : nid(nid), depth(depth), loss_chg(loss_chg), timestamp(tstmp) {}
-  };
-  inline static bool DepthWise(ExpandEntry lhs, ExpandEntry rhs) {
-    if (lhs.depth == rhs.depth) {
-      return lhs.timestamp > rhs.timestamp;  // favor small timestamp
-    } else {
-      return lhs.depth > rhs.depth;  // favor small depth
-    }
-  }
+  void ExpandWithDepthWidth(const GHistIndexMatrix &gmat,
+                            const GHistIndexBlockMatrix &gmatb,
+                            const ColumnMatrix &column_matrix,
+                            DMatrix *p_fmat,
+                            RegTree *p_tree,
+                            const std::vector<GradientPair> &gpair_h);
+
+  void BuildLocalHistograms(int *starting_index,
+                            int *sync_count,
+                            const GHistIndexMatrix &gmat,
+                            const GHistIndexBlockMatrix &gmatb,
+                            RegTree *p_tree,
+                            const std::vector<GradientPair> &gpair_h);
+
+  void SyncHistograms(int starting_index,
+                      int sync_count,
+                      RegTree *p_tree);
+
+  void BuildNodeStats(const GHistIndexMatrix &gmat,
+                      DMatrix *p_fmat,
+                      RegTree *p_tree,
+                      const std::vector<GradientPair> &gpair_h);
+
+  void EvaluateSplits(const GHistIndexMatrix &gmat,
+                      const ColumnMatrix &column_matrix,
+                      DMatrix *p_fmat,
+                      RegTree *p_tree,
+                      int *num_leaves,
+                      int depth,
+                      unsigned *timestamp,
+                      std::vector<ExpandEntry> *temp_qexpand_depth);
+
+  void ExpandWithLossGuide(const GHistIndexMatrix& gmat,
+                           const GHistIndexBlockMatrix& gmatb,
+                           const ColumnMatrix& column_matrix,
+                           DMatrix* p_fmat,
+                           RegTree* p_tree,
+                           const std::vector<GradientPair>& gpair_h);
+
   inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) {
     if (lhs.loss_chg == rhs.loss_chg) {
       return lhs.timestamp > rhs.timestamp;  // favor small timestamp
@@ -218,13 +334,20 @@ class QuantileHistMaker: public TreeUpdater {
   const DMatrix* p_last_fmat_;
 
   using ExpandQueue =
-      std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
-                          std::function<bool(ExpandEntry, ExpandEntry)>>;
-  std::unique_ptr<ExpandQueue> qexpand_;
+      std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
+                          std::function<bool(ExpandEntry, ExpandEntry)>>;
+
+  std::unique_ptr<ExpandQueue> qexpand_loss_guided_;
+  std::vector<ExpandEntry> qexpand_depth_wise_;
+  // key is the node id to be computed via the Subtraction Trick; value is the
+  // sibling whose histogram is subtracted from the parent's histogram
+  std::unordered_map<int, int> nodes_for_subtraction_trick_;
 
   enum DataLayout { kDenseDataZeroBased, kDenseDataOneBased, kSparseData };
   DataLayout data_layout_;
 
+  TreeGrowingPerfMonitor perf_monitor;
+
   rabit::Reducer<GradStats, GradStats::Reduce> histred_;
 };
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc
index de2cb6253..47ed36c92 100644
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -46,7 +46,7 @@ class QuantileHistMock : public QuantileHistMaker {
       GHistIndexBlockMatrix quantile_index_block;
       hist_.AddHistRow(nid);
       BuildHist(gpair, row_set_collection_[nid],
-                gmat, quantile_index_block, hist_[nid]);
+                gmat, quantile_index_block, hist_[nid], false);
       std::vector<GHistEntry> solution {
         {0.27f, 0.29f}, {0.27f, 0.29f}, {0.47f, 0.49f}, {0.27f, 0.29f},
         {0.57f, 0.59f}, {0.26f, 0.27f},
@@ -79,7 +79,7 @@ class QuantileHistMock : public QuantileHistMaker {
       hist_.AddHistRow(0);
       BuildHist(row_gpairs, row_set_collection_[0],
-                gmat, quantile_index_block, hist_[0]);
+                gmat, quantile_index_block, hist_[0], false);
       RealImpl::InitNewNode(0, gmat, row_gpairs, *(*dmat), tree);
 
       // Manipulate the root_gain so that I don't have to invent an actual