[jvm-packages] XGBoost Spark integration refactor (#3387)

* add back train method but mark as deprecated
* add back train method but mark as deprecated
* fix scalastyle error
* fix scalastyle error
* [jvm-packages] XGBoost Spark integration refactor. (#3313)
* XGBoost Spark integration refactor.
* Make corresponding update for xgboost4j-example
* Address comments.
* [jvm-packages] Refactor XGBoost-Spark params to make it compatible with both XGBoost and Spark MLLib (#3326)
* Refactor XGBoost-Spark params to make it compatible with both XGBoost and Spark MLLib
* Fix extra space.
* [jvm-packages] XGBoost Spark supports ranking with group data. (#3369)
* XGBoost Spark supports ranking with group data.
* Use Iterator.duplicate to prevent OOM.
* Update CheckpointManagerSuite.scala
* Resolve conflicts
2018-06-18 15:39:18 -07:00
parent e6696337e4
commit 2c4359e914
34 changed files with 1921 additions and 2173 deletions
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkModelTuningTool.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkModelTuningTool.scala
@@ -21,7 +21,7 @@ import scala.collection.mutable
 import scala.collection.mutable.ListBuffer
 import scala.io.Source

-import ml.dmlc.xgboost4j.scala.spark.{XGBoostEstimator, XGBoost}
+import ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor
 import org.apache.spark.ml.Pipeline
 import org.apache.spark.ml.evaluation.RegressionEvaluator
 import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer}
@@ -160,10 +160,10 @@ object SparkModelTuningTool {
  private def crossValidation(
      xgboostParam: Map[String, Any],
      trainingData: Dataset[_]): TrainValidationSplitModel = {
-    val xgbEstimator = new XGBoostEstimator(xgboostParam).setFeaturesCol("features").
+    val xgbEstimator = new XGBoostRegressor(xgboostParam).setFeaturesCol("features").
      setLabelCol("logSales")
    val paramGrid = new ParamGridBuilder()
-      .addGrid(xgbEstimator.round, Array(20, 50))
+      .addGrid(xgbEstimator.numRound, Array(20, 50))
      .addGrid(xgbEstimator.eta, Array(0.1, 0.4))
      .build()
    val tv = new TrainValidationSplit()
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkWithDataFrame.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkWithDataFrame.scala
@@ -17,7 +17,7 @@
 package ml.dmlc.xgboost4j.scala.example.spark

 import ml.dmlc.xgboost4j.scala.Booster
-import ml.dmlc.xgboost4j.scala.spark.XGBoost
+import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.SparkConf

@@ -45,9 +45,10 @@ object SparkWithDataFrame {
    val paramMap = List(
      "eta" -> 0.1f,
      "max_depth" -> 2,
-      "objective" -> "binary:logistic").toMap
-    val xgboostModel = XGBoost.trainWithDataFrame(
-      trainDF, paramMap, numRound, nWorkers = args(1).toInt, useExternalMemory = true)
+      "objective" -> "binary:logistic",
+      "num_round" -> numRound,
+      "nWorkers" -> args(1).toInt).toMap
+    val xgboostModel = new XGBoostClassifier(paramMap).fit(trainDF)
    // xgboost-spark appends the column containing prediction results
    xgboostModel.transform(testDF).show()
  }
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkWithRDD.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkWithRDD.scala
@@ -1,58 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.example.spark
-
-import ml.dmlc.xgboost4j.scala.Booster
-import ml.dmlc.xgboost4j.scala.spark.XGBoost
-
-import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
-import org.apache.spark.ml.linalg.{DenseVector => MLDenseVector}
-import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.{SparkConf, SparkContext}
-
-object SparkWithRDD {
-  def main(args: Array[String]): Unit = {
-    if (args.length != 5) {
-      println(
-        "usage: program num_of_rounds num_workers training_path test_path model_path")
-      sys.exit(1)
-    }
-    val sparkConf = new SparkConf().setAppName("XGBoost-spark-example")
-      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
-    sparkConf.registerKryoClasses(Array(classOf[Booster]))
-    implicit val sc = new SparkContext(sparkConf)
-    val inputTrainPath = args(2)
-    val inputTestPath = args(3)
-    val outputModelPath = args(4)
-    // number of iterations
-    val numRound = args(0).toInt
-    val trainRDD = MLUtils.loadLibSVMFile(sc, inputTrainPath).map(lp =>
-      MLLabeledPoint(lp.label, new MLDenseVector(lp.features.toArray)))
-    val testSet = MLUtils.loadLibSVMFile(sc, inputTestPath)
-        .map(lp => new MLDenseVector(lp.features.toArray))
-    // training parameters
-    val paramMap = List(
-      "eta" -> 0.1f,
-      "max_depth" -> 2,
-      "objective" -> "binary:logistic").toMap
-    val xgboostModel = XGBoost.trainWithRDD(trainRDD, paramMap, numRound, nWorkers = args(1).toInt,
-      useExternalMemory = true)
-    xgboostModel.predict(testSet, missingValue = Float.NaN)
-    // save model to HDFS path
-    xgboostModel.saveModelAsHadoopFile(outputModelPath)
-  }
-}