[jvm-packages] XGBoost Spark integration refactor (#3387)

* add back train method but mark as deprecated

* add back train method but mark as deprecated

* fix scalastyle error

* fix scalastyle error

* [jvm-packages] XGBoost Spark integration refactor. (#3313)

* XGBoost Spark integration refactor.

* Make corresponding update for xgboost4j-example

* Address comments.

* [jvm-packages] Refactor XGBoost-Spark params to make it compatible with both XGBoost and Spark MLLib (#3326)

* Refactor XGBoost-Spark params to make it compatible with both XGBoost and Spark MLLib

* Fix extra space.

* [jvm-packages] XGBoost Spark supports ranking with group data. (#3369)

* XGBoost Spark supports ranking with group data.

* Use Iterator.duplicate to prevent OOM.

* Update CheckpointManagerSuite.scala

* Resolve conflicts
This commit is contained in:
Yanbo Liang
2018-06-18 15:39:18 -07:00
committed by Nan Zhu
parent e6696337e4
commit 2c4359e914
34 changed files with 1921 additions and 2173 deletions

View File

@@ -21,7 +21,7 @@ import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.io.Source
import ml.dmlc.xgboost4j.scala.spark.{XGBoostEstimator, XGBoost}
import ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer}
@@ -160,10 +160,10 @@ object SparkModelTuningTool {
private def crossValidation(
xgboostParam: Map[String, Any],
trainingData: Dataset[_]): TrainValidationSplitModel = {
val xgbEstimator = new XGBoostEstimator(xgboostParam).setFeaturesCol("features").
val xgbEstimator = new XGBoostRegressor(xgboostParam).setFeaturesCol("features").
setLabelCol("logSales")
val paramGrid = new ParamGridBuilder()
.addGrid(xgbEstimator.round, Array(20, 50))
.addGrid(xgbEstimator.numRound, Array(20, 50))
.addGrid(xgbEstimator.eta, Array(0.1, 0.4))
.build()
val tv = new TrainValidationSplit()

View File

@@ -17,7 +17,7 @@
package ml.dmlc.xgboost4j.scala.example.spark
import ml.dmlc.xgboost4j.scala.Booster
import ml.dmlc.xgboost4j.scala.spark.XGBoost
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf
@@ -45,9 +45,10 @@ object SparkWithDataFrame {
val paramMap = List(
"eta" -> 0.1f,
"max_depth" -> 2,
"objective" -> "binary:logistic").toMap
val xgboostModel = XGBoost.trainWithDataFrame(
trainDF, paramMap, numRound, nWorkers = args(1).toInt, useExternalMemory = true)
"objective" -> "binary:logistic",
"num_round" -> numRound,
"nWorkers" -> args(1).toInt).toMap
val xgboostModel = new XGBoostClassifier(paramMap).fit(trainDF)
// xgboost-spark appends the column containing prediction results
xgboostModel.transform(testDF).show()
}

View File

@@ -1,58 +0,0 @@
/*
Copyright (c) 2014 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ml.dmlc.xgboost4j.scala.example.spark
import ml.dmlc.xgboost4j.scala.Booster
import ml.dmlc.xgboost4j.scala.spark.XGBoost
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
import org.apache.spark.ml.linalg.{DenseVector => MLDenseVector}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}
object SparkWithRDD {
def main(args: Array[String]): Unit = {
if (args.length != 5) {
println(
"usage: program num_of_rounds num_workers training_path test_path model_path")
sys.exit(1)
}
val sparkConf = new SparkConf().setAppName("XGBoost-spark-example")
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sparkConf.registerKryoClasses(Array(classOf[Booster]))
implicit val sc = new SparkContext(sparkConf)
val inputTrainPath = args(2)
val inputTestPath = args(3)
val outputModelPath = args(4)
// number of iterations
val numRound = args(0).toInt
val trainRDD = MLUtils.loadLibSVMFile(sc, inputTrainPath).map(lp =>
MLLabeledPoint(lp.label, new MLDenseVector(lp.features.toArray)))
val testSet = MLUtils.loadLibSVMFile(sc, inputTestPath)
.map(lp => new MLDenseVector(lp.features.toArray))
// training parameters
val paramMap = List(
"eta" -> 0.1f,
"max_depth" -> 2,
"objective" -> "binary:logistic").toMap
val xgboostModel = XGBoost.trainWithRDD(trainRDD, paramMap, numRound, nWorkers = args(1).toInt,
useExternalMemory = true)
xgboostModel.predict(testSet, missingValue = Float.NaN)
// save model to HDFS path
xgboostModel.saveModelAsHadoopFile(outputModelPath)
}
}