[jvm-packages] XGBoost Spark integration refactor (#3387)
* add back train method but mark as deprecated
* add back train method but mark as deprecated
* fix scalastyle error
* fix scalastyle error
* [jvm-packages] XGBoost Spark integration refactor. (#3313)
* XGBoost Spark integration refactor.
* Make corresponding update for xgboost4j-example
* Address comments.
* [jvm-packages] Refactor XGBoost-Spark params to make it compatible with both XGBoost and Spark MLLib (#3326)
* Refactor XGBoost-Spark params to make it compatible with both XGBoost and Spark MLLib
* Fix extra space.
* [jvm-packages] XGBoost Spark supports ranking with group data. (#3369)
* XGBoost Spark supports ranking with group data.
* Use Iterator.duplicate to prevent OOM.
* Update CheckpointManagerSuite.scala
* Resolve conflicts
This commit is contained in:
@@ -21,37 +21,27 @@ import java.nio.file.Files
|
||||
|
||||
import org.scalatest.{BeforeAndAfterAll, FunSuite}
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
|
||||
class CheckpointManagerSuite extends FunSuite with BeforeAndAfterAll {
|
||||
var sc: SparkContext = _
|
||||
|
||||
/** Creates the local SparkContext that the tests of this suite read through `sc`. */
override def beforeAll(): Unit = {
  val sparkConf = new SparkConf().setAppName("XGBoostSuite").setMaster("local[*]")
  sc = new SparkContext(sparkConf)
}
|
||||
class CheckpointManagerSuite extends FunSuite with PerTest with BeforeAndAfterAll {
|
||||
|
||||
// Lazily builds the pair (model4, model8) that the tests of this suite hand to
// CheckpointManager.updateCheckpoint. The first element is trained with 2 rounds and the second
// with 4 rounds.
// NOTE(review): this span comes from a rendered diff in which the lines the commit removed and
// the lines it added are interleaved without +/- markers. The XGBoost.trainWithRDD statements and
// the XGBoostClassifier(...).fit statements look like the old and the new variant of the same
// code, and the two `"objective" -> ...` lines like the old and new tail of `paramMap` — confirm
// against the checked-out file before relying on this text.
private lazy val (model4, model8) = {
|
||||
import DataUtils._
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML).cache()
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
(XGBoost.trainWithRDD(trainingRDD, paramMap, round = 2, nWorkers = sc.defaultParallelism),
|
||||
XGBoost.trainWithRDD(trainingRDD, paramMap, round = 4, nWorkers = sc.defaultParallelism))
|
||||
"objective" -> "binary:logistic", "num_workers" -> sc.defaultParallelism)
|
||||
(new XGBoostClassifier(paramMap ++ Seq("num_round" -> 2)).fit(training),
|
||||
new XGBoostClassifier(paramMap ++ Seq("num_round" -> 4)).fit(training))
|
||||
}
|
||||
|
||||
test("test update/load models") {
|
||||
val tmpPath = Files.createTempDirectory("test").toAbsolutePath.toString
|
||||
val manager = new CheckpointManager(sc, tmpPath)
|
||||
manager.updateCheckpoint(model4)
|
||||
manager.updateCheckpoint(model4._booster)
|
||||
var files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath))
|
||||
assert(files.length == 1)
|
||||
assert(files.head.getPath.getName == "4.model")
|
||||
assert(manager.loadCheckpointAsBooster.booster.getVersion == 4)
|
||||
|
||||
manager.updateCheckpoint(model8)
|
||||
manager.updateCheckpoint(model8._booster)
|
||||
files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath))
|
||||
assert(files.length == 1)
|
||||
assert(files.head.getPath.getName == "8.model")
|
||||
@@ -61,7 +51,7 @@ class CheckpointManagerSuite extends FunSuite with BeforeAndAfterAll {
|
||||
test("test cleanUpHigherVersions") {
|
||||
val tmpPath = Files.createTempDirectory("test").toAbsolutePath.toString
|
||||
val manager = new CheckpointManager(sc, tmpPath)
|
||||
manager.updateCheckpoint(model8)
|
||||
manager.updateCheckpoint(model8._booster)
|
||||
manager.cleanUpHigherVersions(round = 8)
|
||||
assert(new File(s"$tmpPath/8.model").exists())
|
||||
|
||||
@@ -74,7 +64,8 @@ class CheckpointManagerSuite extends FunSuite with BeforeAndAfterAll {
|
||||
val manager = new CheckpointManager(sc, tmpPath)
|
||||
assertResult(Seq(7))(manager.getCheckpointRounds(checkpointInterval = 0, round = 7))
|
||||
assertResult(Seq(2, 4, 6, 7))(manager.getCheckpointRounds(checkpointInterval = 2, round = 7))
|
||||
manager.updateCheckpoint(model4)
|
||||
manager.updateCheckpoint(model4._booster)
|
||||
assertResult(Seq(4, 6, 7))(manager.getCheckpointRounds(2, 7))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -18,11 +18,13 @@ package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import java.io.File
|
||||
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.apache.spark.sql._
|
||||
import org.scalatest.{BeforeAndAfterEach, FunSuite}
|
||||
|
||||
trait PerTest extends BeforeAndAfterEach { self: FunSuite =>
|
||||
|
||||
protected val numWorkers: Int = Runtime.getRuntime.availableProcessors()
|
||||
|
||||
@transient private var currentSession: SparkSession = _
|
||||
@@ -62,4 +64,30 @@ trait PerTest extends BeforeAndAfterEach { self: FunSuite =>
|
||||
file.delete()
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Turns labeled points into a DataFrame with the columns "id", "label" and "features".
 *
 * The "id" of a row is the position of its point in `labeledPoints`.
 *
 * @param labeledPoints points to convert
 * @param numPartitions number of partitions of the backing RDD (defaults to `numWorkers`)
 */
protected def buildDataFrame(
    labeledPoints: Seq[XGBLabeledPoint],
    numPartitions: Int = numWorkers): DataFrame = {
  import DataUtils._
  val rows = labeledPoints.zipWithIndex.map { case (point, rowId) =>
    (rowId, point.label, point.features)
  }.toList

  val rowRDD = sc.parallelize(rows, numPartitions)
  ss.createDataFrame(rowRDD).toDF("id", "label", "features")
}
|
||||
|
||||
/**
 * Turns labeled points into a DataFrame with the columns "id", "label", "features" and "group".
 *
 * The "id" of a row is the position of its point in `labeledPoints`; "group" is taken from the
 * point's `group` field.
 *
 * @param labeledPoints points to convert
 * @param numPartitions number of partitions of the backing RDD (defaults to `numWorkers`)
 */
protected def buildDataFrameWithGroup(
    labeledPoints: Seq[XGBLabeledPoint],
    numPartitions: Int = numWorkers): DataFrame = {
  import DataUtils._
  val rows = labeledPoints.zipWithIndex.map { case (point, rowId) =>
    (rowId, point.label, point.features, point.group)
  }.toList

  val rowRDD = sc.parallelize(rows, numPartitions)
  ss.createDataFrame(rowRDD).toDF("id", "label", "features", "group")
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,167 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import java.io.{File, FileNotFoundException}
|
||||
import java.util.Arrays
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.DMatrix
|
||||
|
||||
import scala.util.Random
|
||||
import org.apache.spark.ml.feature._
|
||||
import org.apache.spark.ml.{Pipeline, PipelineModel}
|
||||
import org.apache.spark.network.util.JavaUtils
|
||||
import org.scalatest.{BeforeAndAfterAll, FunSuite}
|
||||
|
||||
/**
 * Save/load round-trip tests for the XGBoost-Spark estimators, the models they fit, and an MLlib
 * Pipeline that contains an XGBoostClassifier stage.
 *
 * All artifacts are written below a suite-specific directory under `java.io.tmpdir`, which is
 * recreated in `beforeAll` and removed in `afterAll`.
 */
class PersistenceSuite extends FunSuite with PerTest with BeforeAndAfterAll {

  /** Root directory for everything the tests persist; assigned in `beforeAll`. */
  private var tempDir: File = _

  override def beforeAll(): Unit = {
    super.beforeAll()

    tempDir = new File(System.getProperty("java.io.tmpdir"), this.getClass.getName)
    // File.delete cannot remove a non-empty directory, so a plain delete would silently keep the
    // leftovers of an earlier, aborted run. Remove the whole tree before recreating it.
    delete(tempDir)
    tempDir.mkdirs()
  }

  override def afterAll(): Unit = {
    // Run the parent clean-up even when removing the temporary directory throws.
    try {
      JavaUtils.deleteRecursively(tempDir)
    } finally {
      super.afterAll()
    }
  }

  /**
   * Recursively deletes `f`; does nothing when `f` does not exist.
   *
   * @throws FileNotFoundException if `f` or one of its descendants could not be deleted
   */
  private def delete(f: File): Unit = {
    if (f.exists) {
      if (f.isDirectory) {
        // listFiles yields null when the directory cannot be read; skip the children in that
        // case and let the delete below report the failure.
        Option(f.listFiles).foreach(_.foreach(delete))
      }
      if (!f.delete) {
        throw new FileNotFoundException("Failed to delete file: " + f)
      }
    }
  }

  test("test persistence of XGBoostClassifier and XGBoostClassificationModel") {
    val eval = new EvalError()
    val trainingDF = buildDataFrame(Classification.train)
    val testDM = new DMatrix(Classification.test.iterator)

    // Estimator round trip: every configured parameter must come back unchanged.
    val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
      "objective" -> "binary:logistic", "num_round" -> "10", "num_workers" -> numWorkers)
    val xgbc = new XGBoostClassifier(paramMap)
    val xgbcPath = new File(tempDir, "xgbc").getPath
    xgbc.write.overwrite().save(xgbcPath)
    val xgbc2 = XGBoostClassifier.load(xgbcPath)
    val paramMap2 = xgbc2.MLlib2XGBoostParams
    paramMap.foreach {
      case (k, v) => assert(v.toString == paramMap2(k).toString)
    }

    // Model round trip: the reloaded booster must be byte-identical and score identically.
    val model = xgbc.fit(trainingDF)
    val evalResults = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM)
    assert(evalResults < 0.1)
    val xgbcModelPath = new File(tempDir, "xgbcModel").getPath
    model.write.overwrite().save(xgbcModelPath)
    val model2 = XGBoostClassificationModel.load(xgbcModelPath)
    assert(Arrays.equals(model._booster.toByteArray, model2._booster.toByteArray))

    assert(model.getEta === model2.getEta)
    assert(model.getNumRound === model2.getNumRound)
    assert(model.getRawPredictionCol === model2.getRawPredictionCol)
    val evalResults2 = eval.eval(model2._booster.predict(testDM, outPutMargin = true), testDM)
    assert(evalResults === evalResults2)
  }

  test("test persistence of XGBoostRegressor and XGBoostRegressionModel") {
    val eval = new EvalError()
    val trainingDF = buildDataFrame(Regression.train)
    val testDM = new DMatrix(Regression.test.iterator)

    // Estimator round trip: every configured parameter must come back unchanged.
    val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
      "objective" -> "reg:linear", "num_round" -> "10", "num_workers" -> numWorkers)
    val xgbr = new XGBoostRegressor(paramMap)
    val xgbrPath = new File(tempDir, "xgbr").getPath
    xgbr.write.overwrite().save(xgbrPath)
    val xgbr2 = XGBoostRegressor.load(xgbrPath)
    val paramMap2 = xgbr2.MLlib2XGBoostParams
    paramMap.foreach {
      case (k, v) => assert(v.toString == paramMap2(k).toString)
    }

    // Model round trip: the reloaded booster must be byte-identical and score identically.
    val model = xgbr.fit(trainingDF)
    val evalResults = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM)
    assert(evalResults < 0.1)
    val xgbrModelPath = new File(tempDir, "xgbrModel").getPath
    model.write.overwrite().save(xgbrModelPath)
    val model2 = XGBoostRegressionModel.load(xgbrModelPath)
    assert(Arrays.equals(model._booster.toByteArray, model2._booster.toByteArray))

    assert(model.getEta === model2.getEta)
    assert(model.getNumRound === model2.getNumRound)
    assert(model.getPredictionCol === model2.getPredictionCol)
    val evalResults2 = eval.eval(model2._booster.predict(testDM, outPutMargin = true), testDM)
    assert(evalResults === evalResults2)
  }

  test("test persistence of MLlib pipeline with XGBoostClassificationModel") {

    val r = new Random(0)
    // maybe move to shared context, but requires session to import implicits
    val df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))).
      toDF("feature", "label")

    val assembler = new VectorAssembler()
      .setInputCols(df.columns.filter(!_.contains("label")))
      .setOutputCol("features")

    val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
      "objective" -> "binary:logistic", "num_round" -> "10", "num_workers" -> numWorkers,
      "tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala"))
    val xgb = new XGBoostClassifier(paramMap)

    // Construct MLlib pipeline, save and load
    val pipeline = new Pipeline().setStages(Array(assembler, xgb))
    val pipePath = new File(tempDir, "pipeline").getPath
    pipeline.write.overwrite().save(pipePath)
    val pipeline2 = Pipeline.read.load(pipePath)
    val xgb2 = pipeline2.getStages(1).asInstanceOf[XGBoostClassifier]
    val paramMap2 = xgb2.MLlib2XGBoostParams
    paramMap.foreach {
      case (k, v) => assert(v.toString == paramMap2(k).toString)
    }

    // Model training, save and load
    val pipeModel = pipeline.fit(df)
    val pipeModelPath = new File(tempDir, "pipelineModel").getPath
    pipeModel.write.overwrite().save(pipeModelPath)
    val pipeModel2 = PipelineModel.load(pipeModelPath)

    val xgbModel = pipeModel.stages(1).asInstanceOf[XGBoostClassificationModel]
    val xgbModel2 = pipeModel2.stages(1).asInstanceOf[XGBoostClassificationModel]

    assert(Arrays.equals(xgbModel._booster.toByteArray, xgbModel2._booster.toByteArray))

    assert(xgbModel.getEta === xgbModel2.getEta)
    assert(xgbModel.getNumRound === xgbModel2.getNumRound)
    assert(xgbModel.getRawPredictionCol === xgbModel2.getRawPredictionCol)
  }
}
|
||||
|
||||
@@ -16,8 +16,8 @@
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import scala.collection.mutable
|
||||
import scala.io.Source
|
||||
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
|
||||
trait TrainTestData {
|
||||
@@ -48,6 +48,17 @@ trait TrainTestData {
|
||||
XGBLabeledPoint(label, null, values)
|
||||
}.toList
|
||||
}
|
||||
|
||||
/**
 * Reads comma-separated lines from `resource` and converts each one into a labeled point that
 * carries a group id.
 *
 * On every line the first field is parsed as the label, the last field as the group, and the
 * fields in between as the feature values. Each point is created with weight `1f` and a
 * `Float.NaN` trailing argument.
 *
 * @param resource resource name handed to `getResourceLines`
 * @return one labeled point per line, in line order
 */
protected def getLabeledPointsWithGroup(resource: String): Seq[XGBLabeledPoint] = {
  val points = for (row <- getResourceLines(resource)) yield {
    val fields = row.split(",")
    val rowLabel = fields.head.toFloat
    val rowGroup = fields.last.toInt
    val featureValues = fields.slice(1, fields.length - 1).map(_.toFloat)
    XGBLabeledPoint(rowLabel, null, featureValues, 1f, rowGroup, Float.NaN)
  }
  points.toList
}
|
||||
}
|
||||
|
||||
object Classification extends TrainTestData {
|
||||
@@ -80,11 +91,8 @@ object Regression extends TrainTestData {
|
||||
}
|
||||
|
||||
object Ranking extends TrainTestData {
|
||||
val train0: Seq[XGBLabeledPoint] = getLabeledPoints("/rank-demo-0.txt.train", zeroBased = false)
|
||||
val train1: Seq[XGBLabeledPoint] = getLabeledPoints("/rank-demo-1.txt.train", zeroBased = false)
|
||||
val trainGroup0: Seq[Int] = getGroups("/rank-demo-0.txt.train.group")
|
||||
val trainGroup1: Seq[Int] = getGroups("/rank-demo-1.txt.train.group")
|
||||
val test: Seq[XGBLabeledPoint] = getLabeledPoints("/rank-demo.txt.test", zeroBased = false)
|
||||
val train: Seq[XGBLabeledPoint] = getLabeledPointsWithGroup("/rank.train.csv")
|
||||
val test: Seq[XGBLabeledPoint] = getLabeledPoints("/rank.test.txt", zeroBased = false)
|
||||
|
||||
private def getGroups(resource: String): Seq[Int] = {
|
||||
getResourceLines(resource).map(_.toInt).toList
|
||||
|
||||
@@ -0,0 +1,207 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
|
||||
import org.apache.spark.ml.linalg._
|
||||
import org.apache.spark.ml.param.ParamMap
|
||||
import org.apache.spark.sql._
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
/**
 * Tests for XGBoostClassifier and the XGBoostClassificationModel it fits: agreement with plain
 * XGBoost4j, parameter handling, output schema, multi-class training, base margin, training
 * summary and train/test split.
 */
class XGBoostClassifierSuite extends FunSuite with PerTest {

  test("XGBoost-Spark XGBoostClassifier output should match XGBoost4j") {
    val trainingDM = new DMatrix(Classification.train.iterator)
    val testDM = new DMatrix(Classification.test.iterator)
    val trainingDF = buildDataFrame(Classification.train)
    val testDF = buildDataFrame(Classification.test)
    val round = 5

    val paramMap = Map(
      "eta" -> "1",
      "max_depth" -> "6",
      "silent" -> "1",
      "objective" -> "binary:logistic")

    // Reference model trained directly with XGBoost4j.
    val model1 = ScalaXGBoost.train(trainingDM, paramMap, round)
    val prediction1 = model1.predict(testDM)

    val model2 = new XGBoostClassifier(paramMap ++ Array("num_round" -> round,
      "num_workers" -> numWorkers)).fit(trainingDF)

    // Key the Spark predictions by row id so they can be compared position by position.
    val prediction2 = model2.transform(testDF).
      collect().map(row => (row.getAs[Int]("id"), row.getAs[DenseVector]("probability"))).toMap

    assert(testDF.count() === prediction2.size)
    // the vector length in probability column is 2 since we have to fit to the evaluator in Spark
    for (i <- prediction1.indices) {
      assert(prediction1(i).length === prediction2(i).values.length - 1)
      for (j <- prediction1(i).indices) {
        assert(prediction1(i)(j) === prediction2(i)(j + 1))
      }
    }

    val prediction3 = model1.predict(testDM, outPutMargin = true)
    val prediction4 = model2.transform(testDF).
      collect().map(row => (row.getAs[Int]("id"), row.getAs[DenseVector]("rawPrediction"))).toMap

    assert(testDF.count() === prediction4.size)
    for (i <- prediction3.indices) {
      assert(prediction3(i).length === prediction4(i).values.length)
      for (j <- prediction3(i).indices) {
        assert(prediction3(i)(j) === prediction4(i)(j))
      }
    }
  }

  test("Set params in XGBoost and MLlib way should produce same model") {
    val trainingDF = buildDataFrame(Classification.train)
    val testDF = buildDataFrame(Classification.test)
    val round = 5

    val paramMap = Map(
      "eta" -> "1",
      "max_depth" -> "6",
      "silent" -> "1",
      "objective" -> "binary:logistic",
      "num_round" -> round,
      "num_workers" -> numWorkers)

    // Set params in XGBoost way
    val model1 = new XGBoostClassifier(paramMap).fit(trainingDF)
    // Set params in MLlib way
    val model2 = new XGBoostClassifier()
      .setEta(1)
      .setMaxDepth(6)
      .setSilent(1)
      .setObjective("binary:logistic")
      .setNumRound(round)
      .setNumWorkers(numWorkers)
      .fit(trainingDF)

    val prediction1 = model1.transform(testDF).select("prediction").collect()
    val prediction2 = model2.transform(testDF).select("prediction").collect()

    prediction1.zip(prediction2).foreach { case (Row(p1: Double), Row(p2: Double)) =>
      assert(p1 === p2)
    }
  }

  test("test schema of XGBoostClassificationModel") {
    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
      "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers)
    val trainingDF = buildDataFrame(Classification.train)
    val testDF = buildDataFrame(Classification.test)

    val model = new XGBoostClassifier(paramMap).fit(trainingDF)

    // All three output columns renamed: each must show up next to the input columns.
    model.setRawPredictionCol("raw_prediction")
      .setProbabilityCol("probability_prediction")
      .setPredictionCol("final_prediction")
    val allColumnsDF = model.transform(testDF)
    assert(allColumnsDF.columns.contains("id"))
    assert(allColumnsDF.columns.contains("features"))
    assert(allColumnsDF.columns.contains("label"))
    assert(allColumnsDF.columns.contains("raw_prediction"))
    assert(allColumnsDF.columns.contains("probability_prediction"))
    assert(allColumnsDF.columns.contains("final_prediction"))

    // An empty column name switches that output column off.
    model.setRawPredictionCol("").setPredictionCol("final_prediction")
    val noRawDF = model.transform(testDF)
    assert(!noRawDF.columns.contains("raw_prediction"))
    assert(noRawDF.columns.contains("final_prediction"))

    model.setRawPredictionCol("raw_prediction").setPredictionCol("")
    val noFinalDF = model.transform(testDF)
    assert(noFinalDF.columns.contains("raw_prediction"))
    assert(!noFinalDF.columns.contains("final_prediction"))

    assert(model.summary.trainObjectiveHistory.length === 5)
    assert(model.summary.testObjectiveHistory.isEmpty)
  }

  test("XGBoost and Spark parameters synchronize correctly") {
    val xgbParamMap = Map("eta" -> "1", "objective" -> "binary:logistic")
    // from xgboost params to spark params
    val xgb = new XGBoostClassifier(xgbParamMap)
    assert(xgb.getEta === 1.0)
    assert(xgb.getObjective === "binary:logistic")
    // from spark to xgboost params
    val xgbCopy = xgb.copy(ParamMap.empty)
    assert(xgbCopy.MLlib2XGBoostParams("eta").toString.toDouble === 1.0)
    assert(xgbCopy.MLlib2XGBoostParams("objective").toString === "binary:logistic")
    val xgbCopy2 = xgb.copy(ParamMap.empty.put(xgb.evalMetric, "logloss"))
    assert(xgbCopy2.MLlib2XGBoostParams("eval_metric").toString === "logloss")
  }

  test("multi class classification") {
    val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
      "objective" -> "multi:softmax", "num_class" -> "6", "num_round" -> 5,
      "num_workers" -> numWorkers)
    val trainingDF = buildDataFrame(MultiClassification.train)
    val xgb = new XGBoostClassifier(paramMap)
    val model = xgb.fit(trainingDF)
    assert(model.getEta == 0.1)
    assert(model.getMaxDepth == 6)
    assert(model.numClasses == 6)
  }

  test("use base margin") {
    val training1 = buildDataFrame(Classification.train)
    val training2 = training1.withColumn("margin", functions.rand())
    val test = buildDataFrame(Classification.test)
    // NOTE(review): the "train/test split" test in this suite spells the split key
    // "train_test_ratio"; confirm whether "test_train_split" below is still a recognised key.
    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
      "objective" -> "binary:logistic", "test_train_split" -> "0.5",
      "num_round" -> 5, "num_workers" -> numWorkers)

    val xgb = new XGBoostClassifier(paramMap)
    val model1 = xgb.fit(training1)
    val model2 = xgb.setBaseMarginCol("margin").fit(training2)
    val prediction1 = model1.transform(test).select(model1.getProbabilityCol)
      .collect().map(row => row.getAs[Vector](0))
    val prediction2 = model2.transform(test).select(model2.getProbabilityCol)
      .collect().map(row => row.getAs[Vector](0))
    // A random base margin has to change at least one predicted probability vector.
    val differing = prediction1.zip(prediction2).count { case (r1, r2) => !r1.equals(r2) }
    assert(differing != 0)
  }

  test("training summary") {
    // The worker-count key is "num_workers", as in every other test of this suite; the previous
    // spelling "nWorkers" did not match that key.
    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
      "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers)

    val trainingDF = buildDataFrame(Classification.train)
    val xgb = new XGBoostClassifier(paramMap)
    val model = xgb.fit(trainingDF)

    assert(model.summary.trainObjectiveHistory.length === 5)
    assert(model.summary.testObjectiveHistory.isEmpty)
  }

  test("train/test split") {
    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
      "objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
      "num_round" -> 5, "num_workers" -> numWorkers)
    val training = buildDataFrame(Classification.train)

    val xgb = new XGBoostClassifier(paramMap)
    val model = xgb.fit(training)
    // The pattern fails the test with a MatchError when no test history was recorded.
    val Some(testObjectiveHistory) = model.summary.testObjectiveHistory
    assert(testObjectiveHistory.length === 5)
    assert(model.summary.trainObjectiveHistory !== testObjectiveHistory)
  }
}
|
||||
@@ -17,36 +17,34 @@
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
|
||||
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.apache.spark.sql._
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
// Suite that runs XGBoost-Spark training on a SparkSession configured with the Kryo serializer
// and with Booster registered as a Kryo class.
// NOTE(review): this span comes from a rendered diff in which the lines the commit removed and
// the lines it added are interleaved without +/- markers. The XGBoost.trainWithRDD statements and
// the XGBoostClassifier(...).fit statements look like the old and the new variant of the same
// code, as do the duplicated `"objective" -> ...` lines — confirm against the checked-out file.
class XGBoostConfigureSuite extends FunSuite with PerTest {
|
||||
|
||||
// Extends the session builder inherited from PerTest with the two Kryo settings below.
override def sparkSessionBuilder: SparkSession.Builder = super.sparkSessionBuilder
|
||||
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
||||
.config("spark.kryo.classesToRegister", classOf[Booster].getName)
|
||||
|
||||
// Sets "nthread" to one more than spark.task.cpus (default 1) and expects training to be
// rejected with an IllegalArgumentException.
test("nthread configuration must be no larger than spark.task.cpus") {
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic",
|
||||
"objective" -> "binary:logistic", "num_workers" -> numWorkers,
|
||||
"nthread" -> (sc.getConf.getInt("spark.task.cpus", 1) + 1))
|
||||
intercept[IllegalArgumentException] {
|
||||
XGBoost.trainWithRDD(sc.parallelize(List()), paramMap, 5, numWorkers)
|
||||
new XGBoostClassifier(paramMap ++ Seq("num_round" -> 2)).fit(training)
|
||||
}
|
||||
}
|
||||
|
||||
// Trains a binary classifier under the Kryo configuration above and checks that EvalError on
// the classification test set stays below 0.1.
test("kryoSerializer test") {
|
||||
import DataUtils._
|
||||
// TODO write an isolated test for Booster.
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator, null)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator, null)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
"objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
val eval = new EvalError()
|
||||
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) < 0.1)
|
||||
assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,265 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
import org.apache.spark.ml.linalg.DenseVector
|
||||
import org.apache.spark.ml.param.ParamMap
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.types.DataTypes
|
||||
import org.scalatest.FunSuite
|
||||
import org.scalatest.prop.TableDrivenPropertyChecks
|
||||
|
||||
class XGBoostDFSuite extends FunSuite with PerTest with TableDrivenPropertyChecks {
|
||||
/**
 * Turns labeled points into a DataFrame with the columns "id", "label" and "features".
 *
 * The "id" of a row is the position of its point in `labeledPoints`.
 *
 * @param labeledPoints points to convert
 * @param numPartitions number of partitions of the backing RDD (defaults to `numWorkers`)
 */
private def buildDataFrame(
    labeledPoints: Seq[XGBLabeledPoint],
    numPartitions: Int = numWorkers): DataFrame = {
  import DataUtils._
  val rows = labeledPoints.zipWithIndex.map { case (point, rowId) =>
    (rowId, point.label, point.features)
  }.toList

  val rowRDD = sc.parallelize(rows, numPartitions)
  ss.createDataFrame(rowRDD).toDF("id", "label", "features")
}
|
||||
|
||||
// Trains the same configuration once with plain XGBoost4j and once through the DataFrame API,
// then checks that both yield the same probabilities for every test row, matched by row id.
test("test consistency and order preservation of dataframe-based model") {
  val numRounds = 5
  val params = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "binary:logistic")

  // Reference predictions from a booster trained locally.
  val localTrainDM = new DMatrix(Classification.train.iterator)
  val localTestDM = new DMatrix(Classification.test.iterator)
  val localBooster = ScalaXGBoost.train(localTrainDM, params, numRounds)
  val expected = localBooster.predict(localTestDM)

  val sparkModel = XGBoost.trainWithDataFrame(buildDataFrame(Classification.train), params,
    round = numRounds, nWorkers = numWorkers)
  val testDF = buildDataFrame(Classification.test)
  val actualById = sparkModel.setExternalMemory(true).transform(testDF).collect()
    .map(row => (row.getAs[Int]("id"), row.getAs[DenseVector]("probabilities")))
    .toMap
  assert(testDF.count() === actualById.size)

  // The probabilities vector has length 2 to fit the Spark evaluator, so the XGBoost4j value is
  // compared against the entry one position further along.
  expected.zipWithIndex.foreach { case (expectedRow, i) =>
    assert(expectedRow.length === actualById(i).values.length - 1)
    expectedRow.indices.foreach { j =>
      assert(expectedRow(j) === actualById(i)(j + 1))
    }
  }
}
|
||||
|
||||
// Smoke test: transformLeaf on the classification test set must run and be displayable.
test("test transformLeaf") {
  val params = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "binary:logistic")
  val sparkModel = XGBoost.trainWithDataFrame(buildDataFrame(Classification.train), params,
    round = 5, nWorkers = numWorkers)
  val leafDF = sparkModel.transformLeaf(buildDataFrame(Classification.test))
  leafDF.show()
}
|
||||
|
||||
// Checks that the regression model keeps the input columns and adds the renamed prediction
// column.
test("test schema of XGBoostRegressionModel") {
  val params = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "reg:linear")
  val sparkModel = XGBoost.trainWithDataFrame(buildDataFrame(Regression.train), params,
    round = 5, nWorkers = numWorkers, useExternalMemory = true)
  sparkModel.setPredictionCol("final_prediction")

  val testDF = buildDataFrame(Regression.test)
  val predictionDF = sparkModel.setExternalMemory(true).transform(testDF)
  Seq("id", "features", "label", "final_prediction").foreach { column =>
    assert(predictionDF.columns.contains(column))
  }
  predictionDF.show()
}
|
||||
|
||||
// Checks which output columns the classification model emits for three settings of the raw
// prediction and prediction column names; an empty name is expected to suppress the column.
test("test schema of XGBoostClassificationModel") {
  val params = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "binary:logistic")
  val sparkModel = XGBoost.trainWithDataFrame(buildDataFrame(Classification.train), params,
    round = 5, nWorkers = numWorkers, useExternalMemory = true)
  val testDF = buildDataFrame(Classification.test)
  val inputColumns = Seq("id", "features", "label")

  // Both output columns named.
  sparkModel.asInstanceOf[XGBoostClassificationModel]
    .setRawPredictionCol("raw_prediction")
    .setPredictionCol("final_prediction")
  val bothDF = sparkModel.setExternalMemory(true).transform(testDF)
  inputColumns.foreach(column => assert(bothDF.columns.contains(column)))
  assert(bothDF.columns.contains("raw_prediction"))
  assert(bothDF.columns.contains("final_prediction"))

  // Raw prediction column switched off.
  sparkModel.asInstanceOf[XGBoostClassificationModel]
    .setRawPredictionCol("")
    .setPredictionCol("final_prediction")
  val noRawDF = sparkModel.transform(testDF)
  inputColumns.foreach(column => assert(noRawDF.columns.contains(column)))
  assert(noRawDF.columns.contains("raw_prediction") === false)
  assert(noRawDF.columns.contains("final_prediction"))

  // Prediction column switched off.
  sparkModel.asInstanceOf[XGBoostClassificationModel]
    .setRawPredictionCol("raw_prediction")
    .setPredictionCol("")
  val noFinalDF = sparkModel.transform(testDF)
  inputColumns.foreach(column => assert(noFinalDF.columns.contains(column)))
  assert(noFinalDF.columns.contains("raw_prediction"))
  assert(noFinalDF.columns.contains("final_prediction") === false)
}
|
||||
|
||||
// Values given as an XGBoost parameter map must be readable as Spark params, and a copy of the
// estimator must export them again as XGBoost parameters.
test("xgboost and spark parameters synchronize correctly") {
  val estimator = new XGBoostEstimator(Map("eta" -> "1", "objective" -> "binary:logistic"))

  // XGBoost params -> Spark params
  val etaParam = estimator.get(estimator.eta)
  val objectiveParam = estimator.get(estimator.objective)
  assert(etaParam.get === 1.0)
  assert(objectiveParam.get === "binary:logistic")

  // Spark params -> XGBoost params
  val copied = estimator.copy(ParamMap.empty)
  assert(copied.fromParamsToXGBParamMap("eta").toString.toDouble === 1.0)
  assert(copied.fromParamsToXGBParamMap("objective").toString === "binary:logistic")
}
|
||||
|
||||
test("eval_metric is configured correctly") {
  // No eval_metric is supplied here, yet the estimator is expected to report "error".
  val estimator = new XGBoostEstimator(Map("eta" -> "1", "objective" -> "binary:logistic"))
  val configuredMetric = estimator.get(estimator.evalMetric)
  assert(configuredMetric.get === "error")

  // A copy made with an empty Spark ParamMap must export the same metric.
  val extraParams = ParamMap.empty
  val plainCopy = estimator.copy(extraParams)
  assert(plainCopy.fromParamsToXGBParamMap("eval_metric") === "error")

  // A copy made with an explicit evalMetric entry must export the overriding value.
  val loglossParams = extraParams.put(estimator.evalMetric, "logloss")
  val overriddenCopy = estimator.copy(loglossParams)
  assert(overriddenCopy.fromParamsToXGBParamMap("eval_metric") === "logloss")
}
|
||||
|
||||
ignore("fast histogram algorithm parameters are exposed correctly") {
  // Currently disabled (registered with `ignore`). Trains with tree_method = "hist" and
  // requires the EvalError score on the classification test set to stay below 0.1.
  //
  // "max_depth" used to be bound twice in this literal ("0" first, "2" later). A Map
  // keeps only the last binding of a key, so "2" was always the effective value; the
  // dead "0" entry is removed so the literal shows exactly what training receives.
  val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "silent" -> "0",
    "objective" -> "binary:logistic", "tree_method" -> "hist",
    "grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2",
    "eval_metric" -> "error")
  val testItr = Classification.test.iterator
  val trainingDF = buildDataFrame(Classification.train)
  // The worker count is capped at 2 for this configuration.
  val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
    round = 10, nWorkers = math.min(2, numWorkers))
  val error = new EvalError
  val testSetDMatrix = new DMatrix(testItr)
  // Score the raw margins of the trained booster against the same test matrix.
  assert(error.eval(xgBoostModelWithDF.booster.predict(testSetDMatrix, outPutMargin = true),
    testSetDMatrix) < 0.1)
}
|
||||
|
||||
test("multi_class classification test") {
  // Smoke test: nothing is asserted, the test passes as long as training does not throw.
  val softmaxParams = Map(
    "eta" -> "0.1",
    "max_depth" -> "6",
    "silent" -> "1",
    "objective" -> "multi:softmax",
    "num_class" -> "6")
  val multiClassDF = buildDataFrame(MultiClassification.train).toDF()
  XGBoost.trainWithDataFrame(multiClassDF, softmaxParams, round = 5, nWorkers = numWorkers)
}
|
||||
|
||||
test("test DF use nested groupData") {
  // Training data is the union of two ranking frames; one group list is supplied per frame.
  val firstTrainingDF = buildDataFrame(Ranking.train0, 1)
  val secondTrainingDF = buildDataFrame(Ranking.train1, 1)
  val rankingTrainingDF = firstTrainingDF.union(secondTrainingDF)
  val trainGroupData: Seq[Seq[Int]] = Seq(Ranking.trainGroup0, Ranking.trainGroup1)
  val rankingParams = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "rank:pairwise", "groupData" -> trainGroupData)

  val rankingModel = XGBoost.trainWithDataFrame(rankingTrainingDF, rankingParams,
    round = 5, nWorkers = 2)
  val testDF = buildDataFrame(Ranking.test)
  val transformedRows = rankingModel.setExternalMemory(true).transform(testDF).collect()
  // Rows are keyed by id, so the size check below also requires the ids to be distinct.
  val featuresById = transformedRows.map { row =>
    (row.getAs[Int]("id"), row.getAs[DenseVector]("features"))
  }.toMap
  assert(testDF.count() === featuresById.size)
}
|
||||
|
||||
test("params of estimator and produced model are coordinated correctly") {
  // Values passed as training params must be readable from the fitted model afterwards.
  val multiClassParams = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "multi:softmax", "num_class" -> "6")
  val multiClassDF = buildDataFrame(MultiClassification.train)
  val trainedModel =
    XGBoost.trainWithDataFrame(multiClassDF, multiClassParams, round = 5, nWorkers = numWorkers)

  val etaOnModel = trainedModel.get[Double](trainedModel.eta)
  assert(etaOnModel.get == 0.1)
  val maxDepthOnModel = trainedModel.get[Int](trainedModel.maxDepth)
  assert(maxDepthOnModel.get == 6)
  val classifier = trainedModel.asInstanceOf[XGBoostClassificationModel]
  assert(classifier.numOfClasses == 6)
}
|
||||
|
||||
test("test use base margin") {
  import DataUtils._
  val plainDf = buildDataFrame(Classification.train)
  // Same rows plus a "margin" column filled by functions.rand().
  val marginDf = plainDf.withColumn("margin", functions.rand())
  val testRDD = sc.parallelize(Classification.test.map(_.features))
  // NOTE(review): "testTrainSplit" is not used by any other test visible here (the split
  // test passes "trainTestRatio") — confirm this key is intentional.
  val marginParams = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "binary:logistic", "baseMarginCol" -> "margin",
    "testTrainSplit" -> 0.5)

  // Trains for one round on `df` and returns the single prediction value of each test row.
  def predictAfterTraining(df: Dataset[_]): Array[Float] = {
    val trained = XGBoost.trainWithDataFrame(df, marginParams, round = 1, nWorkers = numWorkers)
    val perRowPredictions = trained.predict(testRDD)
    perRowPredictions.map { case Array(single) => single }.collect()
  }

  val withoutMargin = predictAfterTraining(plainDf)
  val withMargin = predictAfterTraining(marginDf)
  // At least one prediction has to differ once the margin column is present.
  assert(withoutMargin.zip(withMargin).exists { case (plain, shifted) => plain !== shifted })
}
|
||||
|
||||
test("test use weight") {
  import DataUtils._
  val weightedParams = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "reg:linear", "weightCol" -> "weight")

  // The row with id 0 gets weight 1.0; every other row gets 0.001.
  val weightForId: Int => Float = id => if (id == 0) 1.0f else 0.001f
  val getWeightFromId = udf(weightForId, DataTypes.FloatType)
  val weightedTrainingDF =
    buildDataFrame(Regression.train).withColumn("weight", getWeightFromId(col("id")))

  val trained = XGBoost.trainWithDataFrame(weightedTrainingDF, weightedParams, round = 5,
    nWorkers = numWorkers, useExternalMemory = true)
  val weightedModel = trained.setPredictionCol("final_prediction").setExternalMemory(true)
  val testRDD = sc.parallelize(Regression.test.map(_.features))
  val predictions = weightedModel.predict(testRDD).collect().flatten

  // The predictions rely heavily on the first training instance, so they all have to
  // stay within 0.01 of the first prediction.
  predictions.foreach { prediction =>
    assert(math.abs(prediction - predictions.head) <= 0.01f)
  }
}
|
||||
|
||||
test("training summary") {
  val binaryParams = Map(
    "eta" -> "1",
    "max_depth" -> "6",
    "silent" -> "1",
    "objective" -> "binary:logistic")
  val trainingDf = buildDataFrame(Classification.train)
  val trained =
    XGBoost.trainWithDataFrame(trainingDf, binaryParams, round = 5, nWorkers = numWorkers)

  // Five rounds are requested, so five training-objective entries are expected; the
  // params above request no train/test split, so no test history is expected.
  val trainHistory = trained.summary.trainObjectiveHistory
  assert(trainHistory.length === 5)
  val testHistory = trained.summary.testObjectiveHistory
  assert(testHistory.isEmpty)
}
|
||||
|
||||
test("train/test split") {
  val splitParams = Map(
    "eta" -> "1",
    "max_depth" -> "6",
    "silent" -> "1",
    "objective" -> "binary:logistic",
    "trainTestRatio" -> "0.5")
  val trainingDf = buildDataFrame(Classification.train)

  // The same expectations must hold with and without external memory.
  val externalMemorySettings = Table("useExternalMemory", false, true)
  forAll(externalMemorySettings) { externalMemory =>
    val trained = XGBoost.trainWithDataFrame(trainingDf, splitParams, round = 5,
      nWorkers = numWorkers, useExternalMemory = externalMemory)
    val summary = trained.summary
    // With "trainTestRatio" set a test history must be present; the pattern fails otherwise.
    val Some(testHistory) = summary.testObjectiveHistory
    assert(testHistory.length === 5)
    assert(summary.trainObjectiveHistory !== testHistory)
  }
}
|
||||
}
|
||||
@@ -18,19 +18,18 @@ package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import java.nio.file.Files
|
||||
import java.util.concurrent.LinkedBlockingDeque
|
||||
|
||||
import scala.util.Random
|
||||
import ml.dmlc.xgboost4j.java.Rabit
|
||||
import ml.dmlc.xgboost4j.scala.DMatrix
|
||||
import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
|
||||
import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
|
||||
import org.apache.spark.ml.linalg.{DenseVector, Vectors, Vector => SparkVector}
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.ml.linalg.Vectors
|
||||
import org.apache.spark.sql._
|
||||
import org.scalatest.FunSuite
|
||||
import scala.util.Random
|
||||
|
||||
class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
|
||||
test("test Rabit allreduce to validate Scala-implemented Rabit tracker") {
|
||||
val vectorLength = 100
|
||||
val rdd = sc.parallelize(
|
||||
@@ -87,283 +86,153 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
}
|
||||
|
||||
test("training with external memory cache") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = numWorkers, useExternalMemory = true)
|
||||
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) < 0.1)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers,
|
||||
"use_external_memory" -> true)
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
|
||||
}
|
||||
|
||||
|
||||
test("training with Scala-implemented Rabit tracker") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic",
|
||||
"tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala")).toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = numWorkers)
|
||||
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) < 0.1)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers,
|
||||
"tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala"))
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
|
||||
}
|
||||
|
||||
|
||||
ignore("test with fast histo depthwise") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "depthwise", "eval_metric" -> "error")
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "depthwise",
|
||||
"eval_metric" -> "error", "num_round" -> 5, "num_workers" -> math.min(numWorkers, 2))
|
||||
// TODO: histogram algorithm seems to be very very sensitive to worker number
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = math.min(numWorkers, 2))
|
||||
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) < 0.1)
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
|
||||
}
|
||||
|
||||
ignore("test with fast histo lossguide") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "lossguide", "max_leaves" -> "8", "eval_metric" -> "error")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = math.min(numWorkers, 2))
|
||||
val x = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix)
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist", "grow_policy" -> "lossguide",
|
||||
"max_leaves" -> "8", "eval_metric" -> "error", "num_round" -> 5,
|
||||
"num_workers" -> math.min(numWorkers, 2))
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
val x = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM)
|
||||
assert(x < 0.1)
|
||||
}
|
||||
|
||||
ignore("test with fast histo lossguide with max bin") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "lossguide", "max_leaves" -> "8", "max_bin" -> "16",
|
||||
"eval_metric" -> "error")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = math.min(numWorkers, 2))
|
||||
val x = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix)
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "lossguide", "max_leaves" -> "8", "max_bin" -> "16",
|
||||
"eval_metric" -> "error", "num_round" -> 5, "num_workers" -> math.min(numWorkers, 2))
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
val x = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM)
|
||||
assert(x < 0.1)
|
||||
}
|
||||
|
||||
ignore("test with fast histo depthwidth with max depth") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "depthwise", "max_leaves" -> "8", "max_depth" -> "2",
|
||||
"eval_metric" -> "error")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 10,
|
||||
nWorkers = math.min(numWorkers, 2))
|
||||
val x = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix)
|
||||
"eval_metric" -> "error", "num_round" -> 10, "num_workers" -> math.min(numWorkers, 2))
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
val x = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM)
|
||||
assert(x < 0.1)
|
||||
}
|
||||
|
||||
ignore("test with fast histo depthwidth with max depth and max bin") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2",
|
||||
"eval_metric" -> "error")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 10,
|
||||
nWorkers = math.min(numWorkers, 2))
|
||||
val x = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix)
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
"grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2",
|
||||
"eval_metric" -> "error", "num_round" -> 10, "num_workers" -> math.min(numWorkers, 2))
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
val x = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM)
|
||||
assert(x < 0.1)
|
||||
}
|
||||
|
||||
test("test with dense vectors containing missing value") {
|
||||
def buildDenseRDD(): RDD[MLLabeledPoint] = {
|
||||
test("dense vectors containing missing value") {
|
||||
def buildDenseDataFrame(): DataFrame = {
|
||||
val numRows = 100
|
||||
val numCols = 5
|
||||
|
||||
val labeledPoints = (0 until numRows).map { _ =>
|
||||
val label = Random.nextDouble()
|
||||
val data = (0 until numRows).map { x =>
|
||||
val label = Random.nextInt(2)
|
||||
val values = Array.tabulate[Double](numCols) { c =>
|
||||
if (c == numCols - 1) -0.1 else Random.nextDouble()
|
||||
if (c == numCols - 1) -0.1 else Random.nextDouble
|
||||
}
|
||||
|
||||
MLLabeledPoint(label, Vectors.dense(values))
|
||||
(label, Vectors.dense(values))
|
||||
}
|
||||
|
||||
sc.parallelize(labeledPoints)
|
||||
ss.createDataFrame(sc.parallelize(data.toList)).toDF("label", "features")
|
||||
}
|
||||
|
||||
val trainingRDD = buildDenseRDD().repartition(4)
|
||||
val testRDD = buildDenseRDD().repartition(4).map(_.features.asInstanceOf[DenseVector])
|
||||
val denseDF = buildDenseDataFrame().repartition(4)
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers,
|
||||
useExternalMemory = true)
|
||||
xgBoostModel.predict(testRDD, missingValue = -0.1f).collect()
|
||||
}
|
||||
|
||||
test("test consistency of prediction functions with RDD") {
  import DataUtils._
  val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
  val testSet = Classification.test
  val testRDD = sc.parallelize(testSet, numSlices = 1).map(_.features)

  // Precondition: collecting the test RDD yields the same feature values, in the same
  // order, as the local test set.
  val collectedFeatures = testRDD.collect()
  testSet.indices.foreach { idx =>
    val collectedValues = collectedFeatures(idx).toDense.values
    val localValues = testSet(idx).features.toDense.values
    assert(collectedValues.sameElements(localValues))
  }

  val binaryParams = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
    "objective" -> "binary:logistic")
  val trained = XGBoost.trainWithRDD(trainingRDD, binaryParams, 5, numWorkers)

  // Predictions obtained through the RDD API ...
  val rddPredictions = trained.predict(testRDD).collect()
  assert(testRDD.count() === rddPredictions.length)
  // ... must match, element by element, those of the booster on a local DMatrix.
  val boosterPredictions = trained.booster.predict(new DMatrix(testSet.iterator))
  rddPredictions.indices.foreach { row =>
    rddPredictions(row).indices.foreach { col =>
      assert(rddPredictions(row)(col) === boosterPredictions(row)(col))
    }
  }
}
|
||||
|
||||
test("test eval functions with RDD") {
  import DataUtils._
  val binaryParams = Map(
    "eta" -> "1",
    "max_depth" -> "2",
    "silent" -> "1",
    "objective" -> "binary:logistic")
  val cachedTrainingRDD = sc.parallelize(Classification.train).map(_.asML).cache()
  val trained =
    XGBoost.trainWithRDD(cachedTrainingRDD, binaryParams, round = 5, nWorkers = numWorkers)
  // Nan Zhu: the iteration-based overload is deprecated for now, so it is not exercised:
  // trained.eval(cachedTrainingRDD, "eval1", iter = 5, useExternalCache = false)
  // No assertion on the result: the call only has to complete without throwing.
  trained.eval(cachedTrainingRDD, "eval2", evalFunc = new EvalError, useExternalCache = false)
}
|
||||
|
||||
test("test prediction functionality with empty partition") {
  import DataUtils._

  // Builds an RDD with no elements split into `numWorkers` partitions, on the given
  // context if one is passed and on the suite's `sc` otherwise.
  def buildEmptyRDD(sparkContext: Option[SparkContext] = None): RDD[SparkVector] = {
    sparkContext.getOrElse(sc).parallelize(List[SparkVector](), numWorkers)
  }

  val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
  val testRDD = buildEmptyRDD()
  val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
    "objective" -> "binary:logistic").toMap
  val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
  // This check used to be wrapped in println(...), which only printed the comparison
  // result and could never fail the test. It is now a real assertion: predicting on an
  // empty RDD must produce no rows.
  assert(xgBoostModel.predict(testRDD).collect().length === 0)
}
|
||||
|
||||
test("test use groupData") {
  import DataUtils._
  val trainGroupData: Seq[Seq[Int]] = Seq(Ranking.trainGroup0)
  val rankingTrainingRDD = sc.parallelize(Ranking.train0, numSlices = 1).map(_.asML)
  val rankingTestRDD = sc.parallelize(Ranking.test, numSlices = 1).map(_.features)

  val rankingParams = Map(
    "eta" -> "1",
    "max_depth" -> "2",
    "silent" -> "1",
    "objective" -> "rank:pairwise",
    "eval_metric" -> "ndcg",
    "groupData" -> trainGroupData)

  val rankingModel = XGBoost.trainWithRDD(rankingTrainingRDD, rankingParams, 2, nWorkers = 1)
  val collectedPredictions: Array[Array[Float]] = rankingModel.predict(rankingTestRDD).collect()
  assert(rankingTestRDD.count() === collectedPredictions.length)

  // The eval output is expected to name the metric and carry its value after '='.
  val metricReport =
    rankingModel.eval(rankingTrainingRDD, "test", iter = 0, groupData = trainGroupData)
  assert(metricReport.contains("ndcg"))
  // If the labels were lost ndcg comes back as 1.0
  val ndcgValue = metricReport.split('=')(1).toFloat
  assert(ndcgValue < 1F)
}
|
||||
|
||||
test("test use nested groupData") {
|
||||
import DataUtils._
|
||||
val trainingRDD0 = sc.parallelize(Ranking.train0, numSlices = 1)
|
||||
val trainingRDD1 = sc.parallelize(Ranking.train1, numSlices = 1)
|
||||
val trainingRDD = trainingRDD0.union(trainingRDD1).map(_.asML)
|
||||
|
||||
val trainGroupData: Seq[Seq[Int]] = Seq(Ranking.trainGroup0, Ranking.trainGroup1)
|
||||
|
||||
val testRDD = sc.parallelize(Ranking.test, numSlices = 1).map(_.features)
|
||||
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "rank:pairwise", "groupData" -> trainGroupData)
|
||||
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, nWorkers = 2)
|
||||
val predRDD = xgBoostModel.predict(testRDD)
|
||||
val predResult1: Array[Array[Float]] = predRDD.collect()
|
||||
assert(testRDD.count() === predResult1.length)
|
||||
"objective" -> "binary:logistic", "missing" -> -0.1f, "num_workers" -> numWorkers).toMap
|
||||
val model = new XGBoostClassifier(paramMap).fit(denseDF)
|
||||
model.transform(denseDF).collect()
|
||||
}
|
||||
|
||||
test("training with spark parallelism checks disabled") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "timeout_request_workers" -> 0L).toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = numWorkers)
|
||||
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) < 0.1)
|
||||
}
|
||||
|
||||
test("isClassificationTask correctly classifies supported objectives") {
|
||||
import org.scalatest.prop.TableDrivenPropertyChecks._
|
||||
|
||||
val objectives = Table(
|
||||
("isClassificationTask", "params"),
|
||||
(true, Map("obj_type" -> "classification")),
|
||||
(false, Map("obj_type" -> "regression")),
|
||||
(false, Map("objective" -> "rank:ndcg")),
|
||||
(false, Map("objective" -> "rank:pairwise")),
|
||||
(false, Map("objective" -> "rank:map")),
|
||||
(false, Map("objective" -> "count:poisson")),
|
||||
(true, Map("objective" -> "binary:logistic")),
|
||||
(true, Map("objective" -> "binary:logitraw")),
|
||||
(true, Map("objective" -> "multi:softmax")),
|
||||
(true, Map("objective" -> "multi:softprob")),
|
||||
(false, Map("objective" -> "reg:linear")),
|
||||
(false, Map("objective" -> "reg:logistic")),
|
||||
(false, Map("objective" -> "reg:gamma")),
|
||||
(false, Map("objective" -> "reg:tweedie")))
|
||||
forAll (objectives) { (isClassificationTask: Boolean, params: Map[String, String]) =>
|
||||
assert(XGBoost.isClassificationTask(params) == isClassificationTask)
|
||||
}
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "timeout_request_workers" -> 0L,
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
val model = new XGBoostClassifier(paramMap).fit(training)
|
||||
val x = eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM)
|
||||
assert(x < 0.1)
|
||||
}
|
||||
|
||||
test("training with checkpoint boosters") {
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
|
||||
val tmpPath = Files.createTempDirectory("model1").toAbsolutePath.toString
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> 2, "silent" -> "1",
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> 2, "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "checkpoint_path" -> tmpPath,
|
||||
"checkpoint_interval" -> 2).toMap
|
||||
val prevModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = numWorkers)
|
||||
def error(model: XGBoostModel): Float = eval.eval(
|
||||
model.booster.predict(testSetDMatrix, outPutMargin = true), testSetDMatrix)
|
||||
"checkpoint_interval" -> 2, "num_workers" -> numWorkers)
|
||||
|
||||
val prevModel = new XGBoostClassifier(paramMap ++ Seq("num_round" -> 5)).fit(training)
|
||||
def error(model: Booster): Float = eval.eval(
|
||||
model.predict(testDM, outPutMargin = true), testDM)
|
||||
|
||||
// Check only one model is kept after training
|
||||
val files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath))
|
||||
assert(files.length == 1)
|
||||
assert(files.head.getPath.getName == "8.model")
|
||||
val tmpModel = XGBoost.loadModelFromHadoopFile(s"$tmpPath/8.model")
|
||||
val tmpModel = SXGBoost.loadModel(s"$tmpPath/8.model")
|
||||
|
||||
// Train next model based on prev model
|
||||
val nextModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 8,
|
||||
nWorkers = numWorkers)
|
||||
assert(error(tmpModel) > error(prevModel))
|
||||
assert(error(prevModel) > error(nextModel))
|
||||
assert(error(nextModel) < 0.1)
|
||||
val nextModel = new XGBoostClassifier(paramMap ++ Seq("num_round" -> 8)).fit(training)
|
||||
assert(error(tmpModel) > error(prevModel._booster))
|
||||
assert(error(prevModel._booster) > error(nextModel._booster))
|
||||
assert(error(nextModel._booster) < 0.1)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,133 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import java.nio.file.Files
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.DMatrix
|
||||
import org.apache.spark.ml.linalg.Vector
|
||||
import org.apache.spark.ml.param.ParamMap
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
class XGBoostModelSuite extends FunSuite with PerTest {
|
||||
test("test model consistency after save and load") {
  import DataUtils._
  val evaluator = new EvalError()
  val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
  val testMatrix = new DMatrix(Classification.test.iterator)
  val tempDir = Files.createTempDirectory("xgboosttest-")
  val tempFile = Files.createTempFile(tempDir, "", "")
  val modelPath = tempFile.toFile.getAbsolutePath
  val binaryParams = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
    "objective" -> "binary:logistic")

  // Score of the freshly trained model.
  val trained = XGBoost.trainWithRDD(trainingRDD, binaryParams, 5, numWorkers)
  val trainedMargins = trained.booster.predict(testMatrix, outPutMargin = true)
  val trainedScore = evaluator.eval(trainedMargins, testMatrix)
  assert(trainedScore < 0.1)

  // After a save/load round trip the reloaded booster must score exactly the same.
  trained.saveModelAsHadoopFile(modelPath)
  val reloaded = XGBoost.loadModelFromHadoopFile(modelPath)
  val reloadedMargins = reloaded.booster.predict(testMatrix, outPutMargin = true)
  val reloadedScore = evaluator.eval(reloadedMargins, testMatrix)
  assert(reloadedScore == trainedScore)
}
|
||||
|
||||
test("test save and load of different types of models") {
  import DataUtils._
  val tempDir = Files.createTempDirectory("xgboosttest-")
  val tempFile = Files.createTempFile(tempDir, "", "")
  // Every scenario below saves to, and reloads from, this one temp file.
  val modelPath = tempFile.toFile.getAbsolutePath
  val binaryTrainingRDD = sc.parallelize(Classification.train).map(_.asML)

  // Scenario 1, regression model: customised column names must survive the round trip.
  val regressionParams = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "reg:linear")
  val regressionModel = XGBoost.trainWithRDD(binaryTrainingRDD, regressionParams, round = 5,
    nWorkers = numWorkers, useExternalMemory = false)
  regressionModel.setFeaturesCol("feature_col")
  regressionModel.setLabelCol("label_col")
  regressionModel.setPredictionCol("prediction_col")
  regressionModel.saveModelAsHadoopFile(modelPath)
  val reloadedRegression = XGBoost.loadModelFromHadoopFile(modelPath)
  assert(reloadedRegression.isInstanceOf[XGBoostRegressionModel])
  assert(reloadedRegression.getFeaturesCol == "feature_col")
  assert(reloadedRegression.getLabelCol == "label_col")
  assert(reloadedRegression.getPredictionCol == "prediction_col")

  // Scenario 2, binary classification model: raw prediction column and thresholds are
  // customised, while the other column names are expected to keep their defaults.
  val binaryParams = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "binary:logistic")
  val binaryModel = XGBoost.trainWithRDD(binaryTrainingRDD, binaryParams, round = 5,
    nWorkers = numWorkers, useExternalMemory = false)
  val binaryClassifier = binaryModel.asInstanceOf[XGBoostClassificationModel]
  binaryClassifier.setRawPredictionCol("raw_col")
  binaryClassifier.setThresholds(Array(0.5, 0.5))
  binaryModel.saveModelAsHadoopFile(modelPath)
  val reloadedBinary = XGBoost.loadModelFromHadoopFile(modelPath)
  assert(reloadedBinary.isInstanceOf[XGBoostClassificationModel])
  val reloadedBinaryClassifier = reloadedBinary.asInstanceOf[XGBoostClassificationModel]
  assert(reloadedBinaryClassifier.getRawPredictionCol == "raw_col")
  assert(reloadedBinaryClassifier.getThresholds.deep == Array(0.5, 0.5).deep)
  assert(reloadedBinary.getFeaturesCol == "features")
  assert(reloadedBinary.getLabelCol == "label")
  assert(reloadedBinary.getPredictionCol == "prediction")

  // Scenario 3, multiclass classification model: same checks plus the class count.
  val multiClassTrainingRDD = sc.parallelize(MultiClassification.train).map(_.asML)
  val multiClassParams = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "multi:softmax", "num_class" -> "6")
  val multiClassModel = XGBoost.trainWithRDD(multiClassTrainingRDD, multiClassParams,
    round = 5, nWorkers = numWorkers, useExternalMemory = false)
  val multiClassifier = multiClassModel.asInstanceOf[XGBoostClassificationModel]
  multiClassifier.setRawPredictionCol("raw_col")
  multiClassifier.setThresholds(Array(0.5, 0.5, 0.5, 0.5, 0.5, 0.5))
  multiClassModel.saveModelAsHadoopFile(modelPath)
  val reloadedMulti = XGBoost.loadModelFromHadoopFile(modelPath)
  assert(reloadedMulti.isInstanceOf[XGBoostClassificationModel])
  val reloadedMultiClassifier = reloadedMulti.asInstanceOf[XGBoostClassificationModel]
  assert(reloadedMultiClassifier.getRawPredictionCol == "raw_col")
  assert(reloadedMultiClassifier.getThresholds.deep ==
    Array(0.5, 0.5, 0.5, 0.5, 0.5, 0.5).deep)
  assert(reloadedMultiClassifier.numOfClasses == 6)
  assert(reloadedMulti.getFeaturesCol == "features")
  assert(reloadedMulti.getLabelCol == "label")
  assert(reloadedMulti.getPredictionCol == "prediction")
}
|
||||
|
||||
test("copy and predict ClassificationModel") {
  import DataUtils._

  // Booster settings for a shallow binary:logistic model.
  val boosterParams = Map(
    "eta" -> "1",
    "max_depth" -> "2",
    "silent" -> "1",
    "objective" -> "binary:logistic")

  // Training rows are converted with `asML`; only the feature vectors of the
  // test rows are kept for prediction.
  val labeledTrainingRows = sc.parallelize(Classification.train).map(_.asML)
  val testFeatures = sc.parallelize(Classification.test).map(_.features)

  val trainedModel = XGBoost.trainWithRDD(
    labeledTrainingRows, boosterParams, round = 5, nWorkers = numWorkers)

  // A copy of the trained model must predict exactly like the original.
  testCopy(trainedModel, testFeatures)
}
|
||||
|
||||
test("copy and predict RegressionModel") {
  import DataUtils._

  // Booster settings for a shallow reg:linear model.
  val boosterParams = Map(
    "eta" -> "1",
    "max_depth" -> "2",
    "silent" -> "1",
    "objective" -> "reg:linear")

  // Training rows are converted with `asML`; only the feature vectors of the
  // test rows are kept for prediction.
  val labeledTrainingRows = sc.parallelize(Regression.train).map(_.asML)
  val testFeatures = sc.parallelize(Regression.test).map(_.features)

  val trainedModel = XGBoost.trainWithRDD(
    labeledTrainingRows, boosterParams, round = 5, nWorkers = numWorkers)

  // A copy of the trained model must predict exactly like the original.
  testCopy(trainedModel, testFeatures)
}
|
||||
|
||||
/**
 * Copies `model` with an empty param map and checks that the copy predicts the same
 * values as the original on `testRDD`.
 *
 * @param model   the trained model to copy
 * @param testRDD feature vectors to run both models on
 */
private def testCopy(model: XGBoostModel, testRDD: RDD[Vector]): Unit = {
  val duplicate = model.copy(ParamMap.empty)
  // Accessing the summary on the copy must not throw.
  duplicate.summary

  // Predict with the original first, then with the copy, and compare the collected results.
  val originalPredictions = model.predict(testRDD).collect()
  val duplicatePredictions = duplicate.predict(testRDD).collect()
  assert(duplicatePredictions === originalPredictions)
}
|
||||
}
|
||||
@@ -0,0 +1,114 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.types._
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
/**
 * Tests for [[XGBoostRegressor]]: agreement with plain XGBoost4j, equivalence of the two
 * ways of setting params, ranking with a group column, and training with a weight column.
 */
class XGBoostRegressorSuite extends FunSuite with PerTest {

  test("XGBoost-Spark XGBoostRegressor ouput should match XGBoost4j: regression") {
    val trainingDM = new DMatrix(Regression.train.iterator)
    val testDM = new DMatrix(Regression.test.iterator)
    val trainingDF = buildDataFrame(Regression.train)
    val testDF = buildDataFrame(Regression.test)
    val round = 5

    val paramMap = Map(
      "eta" -> "1",
      "max_depth" -> "6",
      "silent" -> "1",
      "objective" -> "reg:linear")

    // Reference predictions from plain XGBoost4j on the same data and params.
    val nativePredictions = ScalaXGBoost.train(trainingDM, paramMap, round).predict(testDM)

    // The Spark estimator additionally needs the round count and the worker count.
    val sparkParams: Map[String, Any] =
      paramMap ++ Array("num_round" -> round, "num_workers" -> numWorkers)
    val sparkModel = new XGBoostRegressor(sparkParams).fit(trainingDF)

    // Index the Spark predictions by the "id" column so they can be looked up by row index.
    val sparkPredictionById = sparkModel.transform(testDF)
      .collect()
      .map(row => row.getAs[Int]("id") -> row.getAs[Double]("prediction"))
      .toMap

    // Fewer than 10% of the rows may differ by more than 0.01.
    val mismatches = nativePredictions.indices.count { i =>
      math.abs(nativePredictions(i)(0) - sparkPredictionById(i)) > 0.01
    }
    assert(mismatches < nativePredictions.length * 0.1)
  }

  test("Set params in XGBoost and MLlib way should produce same model") {
    val trainingDF = buildDataFrame(Regression.train)
    val testDF = buildDataFrame(Regression.test)
    val round = 5

    val paramMap = Map(
      "eta" -> "1",
      "max_depth" -> "6",
      "silent" -> "1",
      "objective" -> "reg:linear",
      "num_round" -> round,
      "num_workers" -> numWorkers)

    // Set params in XGBoost way: everything goes through the constructor map.
    val mapConfiguredModel = new XGBoostRegressor(paramMap).fit(trainingDF)

    // Set params in MLlib way: the same values go through the typed setters.
    val setterConfiguredRegressor = new XGBoostRegressor()
      .setEta(1)
      .setMaxDepth(6)
      .setSilent(1)
      .setObjective("reg:linear")
      .setNumRound(round)
      .setNumWorkers(numWorkers)
    val setterConfiguredModel = setterConfiguredRegressor.fit(trainingDF)

    val mapConfiguredRows = mapConfiguredModel.transform(testDF).select("prediction").collect()
    val setterConfiguredRows =
      setterConfiguredModel.transform(testDF).select("prediction").collect()

    // Row-by-row, both models must agree within 0.01.
    mapConfiguredRows.zip(setterConfiguredRows).foreach {
      case (Row(fromMap: Double), Row(fromSetters: Double)) =>
        assert(math.abs(fromMap - fromSetters) <= 0.01f)
    }
  }

  test("ranking: use group data") {
    val rankingParams = Map(
      "eta" -> "1",
      "max_depth" -> "6",
      "silent" -> "1",
      "objective" -> "rank:pairwise",
      "num_workers" -> numWorkers,
      "num_round" -> 5,
      "group_col" -> "group")

    // The training frame is built with group data; the test frame is built without it.
    val trainingDF = buildDataFrameWithGroup(Ranking.train)
    val testDF = buildDataFrame(Ranking.test)
    val rankingModel = new XGBoostRegressor(rankingParams).fit(trainingDF)

    // One output row is expected per test row.
    val scoredRows = rankingModel.transform(testDF).collect()
    assert(testDF.count() === scoredRows.length)
  }

  test("use weight") {
    val regressionParams = Map(
      "eta" -> "1",
      "max_depth" -> "6",
      "silent" -> "1",
      "objective" -> "reg:linear",
      "num_round" -> 5,
      "num_workers" -> numWorkers)

    // Row with id 0 gets weight 1.0; every other row gets weight 0.001.
    val weightForId = (id: Int) => if (id == 0) 1.0f else 0.001f
    val getWeightFromId = udf(weightForId, DataTypes.FloatType)
    val trainingDF = buildDataFrame(Regression.train)
      .withColumn("weight", getWeightFromId(col("id")))
    val testDF = buildDataFrame(Regression.test)

    val weightedModel = new XGBoostRegressor(regressionParams)
      .setWeightCol("weight")
      .fit(trainingDF)
    val scoredRows = weightedModel.transform(testDF).collect()

    // Every prediction must be within 0.01 of the first row's prediction.
    val first = scoredRows.head.getAs[Double]("prediction")
    scoredRows.foreach { row =>
      assert(math.abs(row.getAs[Double]("prediction") - first) <= 0.01f)
    }
  }
}
|
||||
@@ -1,138 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import java.io.{File, FileNotFoundException}
|
||||
|
||||
import scala.util.Random
|
||||
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.ml.feature._
|
||||
import org.apache.spark.ml.{Pipeline, PipelineModel}
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.scalatest.{BeforeAndAfterAll, FunSuite}
|
||||
|
||||
/**
 * Persistence round-trip tests: an XGBoostEstimator, a Pipeline containing one, and a
 * fitted XGBoostModel (saved on its own and as a PipelineModel stage) are written to
 * disk, loaded back, and compared with the originals.
 */
class XGBoostSparkPipelinePersistence extends FunSuite with PerTest
  with BeforeAndAfterAll {

  /** Removes every directory the tests in this suite write to. */
  override def afterAll(): Unit = {
    Seq("./testxgbPipe", "./testxgbEst", "./testxgbModel", "./test2xgbModel")
      .foreach(path => delete(new File(path)))
  }

  /**
   * Recursively deletes `f` if it exists; does nothing when it does not.
   *
   * @param f file or directory to remove
   * @throws FileNotFoundException if `f` (or one of its children) could not be deleted
   */
  private def delete(f: File): Unit = {
    if (f.exists()) {
      if (f.isDirectory) {
        // File.listFiles() returns null on an I/O error; skip straight to the delete
        // attempt in that case so the failure is reported below instead of as an NPE.
        Option(f.listFiles()).foreach(_.foreach(delete))
      }
      if (!f.delete()) {
        throw new FileNotFoundException("Failed to delete file: " + f)
      }
    }
  }

  /**
   * Asserts that `reloaded` has the same booster bytes as `original` and the same values
   * for the useExternalMemory, featuresCol, predictionCol and labelCol params. Each
   * model's params are looked up in that model's own extracted param map.
   *
   * @param clue     prefix added to any assertion failure message
   * @param original the model before it was saved
   * @param reloaded the model loaded back from disk
   */
  private def assertSamePersistedState(
      clue: String,
      original: XGBoostModel,
      reloaded: XGBoostModel): Unit = withClue(clue + ": ") {
    assert(java.util.Arrays.equals(original.booster.toByteArray, reloaded.booster.toByteArray))
    val originalParams = original.extractParamMap()
    val reloadedParams = reloaded.extractParamMap()
    assert(originalParams.get(original.useExternalMemory)
      === reloadedParams.get(reloaded.useExternalMemory))
    assert(originalParams.get(original.featuresCol) === reloadedParams.get(reloaded.featuresCol))
    assert(originalParams.get(original.predictionCol)
      === reloadedParams.get(reloaded.predictionCol))
    assert(originalParams.get(original.labelCol) === reloadedParams.get(reloaded.labelCol))
  }

  test("test persistence of XGBoostEstimator") {
    val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
      "objective" -> "multi:softmax", "num_class" -> "6")
    val xgbEstimator = new XGBoostEstimator(paramMap)
    xgbEstimator.write.overwrite().save("./testxgbEst")
    val loadedxgbEstimator = XGBoostEstimator.read.load("./testxgbEst")
    val loadedParamMap = loadedxgbEstimator.fromParamsToXGBParamMap
    // Every param passed to the constructor must come back with the same string value.
    paramMap.foreach {
      case (k, v) => assert(v == loadedParamMap(k).toString)
    }
  }

  test("test persistence of a complete pipeline") {
    // NOTE(review): the session is not referenced below; it is still created as the
    // original test did — confirm whether PerTest already provides one.
    SparkSession.builder()
      .config(new SparkConf().setAppName("foo").setMaster("local[*]"))
      .getOrCreate()
    val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
      "objective" -> "multi:softmax", "num_class" -> "6")
    val assembler = new VectorAssembler().setInputCols(Array("feature")).setOutputCol("features")
    val xgbEstimator = new XGBoostEstimator(paramMap)
    val pipeline = new Pipeline().setStages(Array(assembler, xgbEstimator))
    pipeline.write.overwrite().save("testxgbPipe")
    val loadedPipeline = Pipeline.read.load("testxgbPipe")
    // Stage 0 is the assembler, stage 1 the estimator (see setStages above).
    val loadedEstimator = loadedPipeline.getStages(1).asInstanceOf[XGBoostEstimator]
    val loadedParamMap = loadedEstimator.fromParamsToXGBParamMap
    paramMap.foreach {
      case (k, v) => assert(v == loadedParamMap(k).toString)
    }
  }

  test("test persistence of XGBoostModel") {
    val conf = new SparkConf().setAppName("foo").setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val r = new Random(0)
    // maybe move to shared context, but requires session to import implicits
    // 10000 rows where the single feature equals the label (0 or 1).
    val df = spark.createDataFrame(Seq.fill(10000)(r.nextInt(2)).map(i => (i, i))).
      toDF("feature", "label")
    val vectorAssembler = new VectorAssembler()
      .setInputCols(df.columns
        .filter(!_.contains("label")))
      .setOutputCol("features")
    val xgbEstimator = new XGBoostEstimator(Map("num_round" -> 10,
      "tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala")
    )).setFeaturesCol("features").setLabelCol("label")

    // separate: fit the estimator directly and persist the model on its own
    val predModel = xgbEstimator.fit(vectorAssembler.transform(df))
    predModel.write.overwrite.save("test2xgbModel")
    val same2Model = XGBoostModel.load("test2xgbModel")
    assertSamePersistedState("separately saved model", predModel, same2Model)

    // chained: fit inside a pipeline and persist the whole PipelineModel
    val predictionModel = new Pipeline().setStages(Array(vectorAssembler, xgbEstimator)).fit(df)
    predictionModel.write.overwrite.save("testxgbModel")
    val sameModel = PipelineModel.load("testxgbModel")

    val predictionModelXGB = predictionModel.stages
      .collectFirst { case xgb: XGBoostModel => xgb }
      .getOrElse(fail("fitted pipeline has no XGBoostModel stage"))
    val sameModelXGB = sameModel.stages
      .collectFirst { case xgb: XGBoostModel => xgb }
      .getOrElse(fail("loaded pipeline has no XGBoostModel stage"))

    // Compare the XGBoost stages themselves: the params checked here belong to the
    // stage, so they have to be read from the stage's param map, not the pipeline's.
    assertSamePersistedState("model saved inside a pipeline", predictionModelXGB, sameModelXGB)
  }
}
|
||||
|
||||
Reference in New Issue
Block a user