[jvm-packages] Scala/Java interface for Fast Histogram Algorithm (#1966)

* add back train method but mark as deprecated * fix scalastyle error * first commit in scala binding for fast histo * java test * add missed scala tests * spark training * add back train method but mark as deprecated * fix scalastyle error * local change * first commit in scala binding for fast histo * local change * fix df frame test
2017-03-04 15:37:24 -08:00
parent ac30a0aff5
commit ab13fd72bd
10 changed files with 400 additions and 37 deletions
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala
@@ -126,9 +126,22 @@ trait BoosterParams extends Params {
   *  [default='auto']
   */
  val treeMethod = new Param[String](this, "tree_method",
-    "The tree construction algorithm used in XGBoost, options: {'auto', 'exact', 'approx'}",
+    "The tree construction algorithm used in XGBoost, options: {'auto', 'exact', 'approx', 'hist'}",
    (value: String) => BoosterParams.supportedTreeMethods.contains(value))

+  /** growth policy for fast histogram algorithm, options: {'depthwise', 'lossguide'}
+   *  [default='depthwise']
+   */
+  val growthPolicty = new Param[String](this, "grow_policy",
+    "growth policy for fast histogram algorithm",
+    (value: String) => BoosterParams.supportedGrowthPolicies.contains(value))
+
+  /** maximum number of bins in histogram, must be a positive integer
+   *  [default=16]
+   */
+  val maxBins = new IntParam(this, "max_bin", "maximum number of bins in histogram",
+    (value: Int) => value > 0)
+
  /**
   * This is only used for approximate greedy algorithm.
   * This roughly translated into O(1 / sketch_eps) number of bins. Compared to directly select
@@ -194,6 +207,7 @@ trait BoosterParams extends Params {

  setDefault(boosterType -> "gbtree", eta -> 0.3, gamma -> 0, maxDepth -> 6,
    minChildWeight -> 1, maxDeltaStep -> 0,
+    growthPolicty -> "depthwise", maxBins -> 16,
    subSample -> 1, colSampleByTree -> 1, colSampleByLevel -> 1,
    lambda -> 1, alpha -> 0, treeMethod -> "auto", sketchEps -> 0.03,
    scalePosWeight -> 1.0, sampleType -> "uniform", normalizeType -> "tree",
@@ -227,7 +241,9 @@ private[spark] object BoosterParams {

  val supportedBoosters = HashSet("gbtree", "gblinear", "dart")

-  val supportedTreeMethods = HashSet("auto", "exact", "approx")
+  val supportedTreeMethods = HashSet("auto", "exact", "approx", "hist")
+
+  val supportedGrowthPolicies = HashSet("depthwise", "lossguide")

  val supportedSampleType = HashSet("uniform", "weighted")

--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostDFSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostDFSuite.scala
@@ -190,6 +190,22 @@ class XGBoostDFSuite extends SharedSparkContext with Utils {
    assert(xgbEstimatorCopy1.fromParamsToXGBParamMap("eval_metric") === "logloss")
  }

+  test("fast histogram algorithm parameters are exposed correctly") {
+    // "max_depth" is given once ("2"): a repeated Map key silently keeps only its last value
+    val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "silent" -> "0",
+      "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise",
+      "max_depth" -> "2", "max_bin" -> "2", "eval_metric" -> "error")
+    val testItr = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
+    val trainingDF = buildTrainingDataframe()
+    val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
+      round = 10, nWorkers = math.min(2, numWorkers))
+    val error = new EvalError
+    import DataUtils._
+    val testSetDMatrix = new DMatrix(new JDMatrix(testItr, null))
+    assert(error.eval(xgBoostModelWithDF.booster.predict(testSetDMatrix, outPutMargin = true),
+      testSetDMatrix) < 0.1)
+  }
+
  private def convertCSVPointToLabelPoint(valueArray: Array[String]): LabeledPoint = {
    val intValueArray = new Array[Double](valueArray.length)
    intValueArray(valueArray.length - 2) = {
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
@@ -111,11 +111,94 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
      "objective" -> "binary:logistic",
      "tracker_conf" -> TrackerConf(1 minute, "scala")).toMap
    val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
-      nWorkers = numWorkers, useExternalMemory = true)
+      nWorkers = numWorkers)
    assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
      testSetDMatrix) < 0.1)
  }

+  test("test with fast histo depthwise") {
+    // histogram algorithm with grow_policy = depthwise and max_depth = 6; expects error < 0.1
+    val metric = new EvalError()
+    val trainingData = buildTrainingRDD(sc)
+    val testIter = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
+    import DataUtils._
+    val testMatrix = new DMatrix(new JDMatrix(testIter, null))
+    val params = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "6", "silent" -> "1",
+      "objective" -> "binary:logistic", "tree_method" -> "hist",
+      "grow_policy" -> "depthwise", "eval_metric" -> "error")
+    // TODO: histogram algorithm seems to be very very sensitive to worker number
+    val model = XGBoost.trainWithRDD(trainingData, params, round = 5,
+      nWorkers = math.min(numWorkers, 2))
+    assert(metric.eval(model.booster.predict(testMatrix, outPutMargin = true), testMatrix) < 0.1)
+  }
+
+  test("test with fast histo lossguide") {
+    // histogram algorithm with grow_policy = lossguide and max_leaves = 8; expects error < 0.1
+    val metric = new EvalError()
+    val trainingData = buildTrainingRDD(sc)
+    val testIter = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
+    import DataUtils._
+    val testMatrix = new DMatrix(new JDMatrix(testIter, null))
+    val params = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "1",
+      "objective" -> "binary:logistic", "tree_method" -> "hist",
+      "grow_policy" -> "lossguide", "max_leaves" -> "8", "eval_metric" -> "error")
+    val model = XGBoost.trainWithRDD(trainingData, params, round = 5,
+      nWorkers = math.min(numWorkers, 2))
+    assert(metric.eval(model.booster.predict(testMatrix, outPutMargin = true),
+      testMatrix) < 0.1)
+  }
+
+  test("test with fast histo lossguide with max bin") {
+    // histogram algorithm with grow_policy = lossguide, max_leaves = 8 and max_bin = 16
+    val metric = new EvalError()
+    val trainingData = buildTrainingRDD(sc)
+    val testIter = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
+    import DataUtils._
+    val testMatrix = new DMatrix(new JDMatrix(testIter, null))
+    val params = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0",
+      "objective" -> "binary:logistic", "tree_method" -> "hist",
+      "grow_policy" -> "lossguide", "max_leaves" -> "8", "max_bin" -> "16",
+      "eval_metric" -> "error")
+    val model = XGBoost.trainWithRDD(trainingData, params, round = 5,
+      nWorkers = math.min(numWorkers, 2))
+    assert(metric.eval(model.booster.predict(testMatrix, outPutMargin = true),
+      testMatrix) < 0.1)
+  }
+
+  test("test with fast histo depthwidth with max depth") {
+    // "max_depth" is given once ("2"): a repeated Map key silently keeps only its last value
+    val eval = new EvalError()
+    val trainingRDD = buildTrainingRDD(sc)
+    val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
+    import DataUtils._
+    val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
+    val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "silent" -> "0",
+      "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise",
+      "max_leaves" -> "8", "max_depth" -> "2", "eval_metric" -> "error")
+    val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 10,
+      nWorkers = math.min(numWorkers, 2))
+    val x = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
+      testSetDMatrix)
+    assert(x < 0.1)
+  }
+
+  test("test with fast histo depthwidth with max depth and max bin") {
+    // "max_depth" is given once ("2"): a repeated Map key silently keeps only its last value
+    val eval = new EvalError()
+    val trainingRDD = buildTrainingRDD(sc)
+    val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
+    import DataUtils._
+    val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
+    val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "silent" -> "0",
+      "objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise",
+      "max_depth" -> "2", "max_bin" -> "2", "eval_metric" -> "error")
+    val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 10,
+      nWorkers = math.min(numWorkers, 2))
+    val x = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
+      testSetDMatrix)
+    assert(x < 0.1)
+  }
+
  test("test with dense vectors containing missing value") {
    def buildDenseRDD(): RDD[LabeledPoint] = {
      val nrow = 100
@@ -142,6 +225,7 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
      }
      sc.parallelize(points)
    }
+
    val trainingRDD = buildDenseRDD().repartition(4)
    val testRDD = buildDenseRDD().repartition(4)
    val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
@@ -189,6 +273,7 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
      val sampleList = new ListBuffer[SparkVector]
      sparkContext.getOrElse(sc).parallelize(sampleList, numWorkers)
    }
+
    val trainingRDD = buildTrainingRDD(sc)
    val testRDD = buildEmptyRDD()
    val tempDir = Files.createTempDirectory("xgboosttest-")