[jvm-packages] separate classification and regression model and integrate with ML package (#1608)

This commit is contained in:
Nan Zhu
2016-09-30 11:49:03 -04:00
committed by GitHub
parent 3b9987ca9c
commit 1673bcbe7e
16 changed files with 771 additions and 381 deletions

View File

@@ -50,6 +50,8 @@ class EvalError extends EvalTrait {
logger.error(ex)
return -1f
}
require(predicts.length == labels.length, s"predicts length ${predicts.length} has to be" +
s" equal with label length ${labels.length}")
val nrow: Int = predicts.length
for (i <- 0 until nrow) {
if (labels(i) == 0.0 && predicts(i)(0) > 0) {

View File

@@ -17,20 +17,21 @@
package ml.dmlc.xgboost4j.scala.spark
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfter, FunSuite}
import org.scalatest.{BeforeAndAfterAll, FunSuite}
class SharedSparkContext extends FunSuite with BeforeAndAfter with Serializable {
trait SharedSparkContext extends FunSuite with BeforeAndAfterAll with Serializable {
@transient protected implicit var sc: SparkContext = null
before {
override def beforeAll() {
// build SparkContext
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite").
set("spark.driver.memory", "512m")
sc = new SparkContext(sparkConf)
sc.setLogLevel("ERROR")
}
after {
override def afterAll() {
if (sc != null) {
sc.stop()
}

View File

@@ -21,17 +21,23 @@ import java.io.File
import scala.collection.mutable.ListBuffer
import scala.io.Source
import ml.dmlc.xgboost4j.java.XGBoostError
import ml.dmlc.xgboost4j.scala.{DMatrix, EvalTrait}
import org.apache.commons.logging.LogFactory
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{DenseVector, Vector => SparkVector}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.{DenseVector, Vector => SparkVector}
import org.apache.spark.rdd.RDD
trait Utils extends Serializable {
protected val numWorkers = Runtime.getRuntime().availableProcessors()
protected var labeledPointsRDD: RDD[LabeledPoint] = null
protected def cleanExternalCache(prefix: String): Unit = {
val dir = new File(".")
for (file <- dir.listFiles() if file.getName.startsWith(prefix)) {
file.delete()
}
}
protected def loadLabelPoints(filePath: String): List[LabeledPoint] = {
val file = Source.fromFile(new File(filePath))
val sampleList = new ListBuffer[LabeledPoint]
@@ -41,6 +47,15 @@ trait Utils extends Serializable {
sampleList.toList
}
protected def loadLabelAndVector(filePath: String): List[(Double, SparkVector)] = {
val file = Source.fromFile(new File(filePath))
val sampleList = new ListBuffer[(Double, SparkVector)]
for (sample <- file.getLines()) {
sampleList += fromSVMStringToLabelAndVector(sample)
}
sampleList.toList
}
protected def fromSVMStringToLabelAndVector(line: String): (Double, SparkVector) = {
val labelAndFeatures = line.split(" ")
val label = labelAndFeatures(0).toDouble
@@ -59,7 +74,10 @@ trait Utils extends Serializable {
}
protected def buildTrainingRDD(sparkContext: SparkContext): RDD[LabeledPoint] = {
val sampleList = loadLabelPoints(getClass.getResource("/agaricus.txt.train").getFile)
sparkContext.parallelize(sampleList, numWorkers)
if (labeledPointsRDD == null) {
val sampleList = loadLabelPoints(getClass.getResource("/agaricus.txt.train").getFile)
labeledPointsRDD = sparkContext.parallelize(sampleList, numWorkers)
}
labeledPointsRDD
}
}

View File

@@ -0,0 +1,60 @@
/*
Copyright (c) 2014 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ml.dmlc.xgboost4j.scala.spark
import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.FunSuite
/**
 * Tests that exercise XGBoost training under non-default Spark configurations
 * (custom task-cpu counts and Kryo serialization), each inside its own
 * dedicated SparkContext.
 */
class XGBoostConfigureSuite extends FunSuite with Utils {

  test("nthread configuration must be equal to spark.task.cpus") {
    // Start a dedicated application whose task-cpu setting conflicts with nthread.
    val conf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
      .set("spark.task.cpus", "4")
    val context = new SparkContext(conf)
    context.setLogLevel("ERROR")
    val trainingSet = buildTrainingRDD(context)
    val params = Map(
      "eta" -> "1", "max_depth" -> "2", "silent" -> "1",
      "objective" -> "binary:logistic", "nthread" -> 6)
    // nthread (6) != spark.task.cpus (4): training must be rejected up front.
    intercept[IllegalArgumentException] {
      XGBoost.trainWithRDD(trainingSet, params, 5, numWorkers)
    }
    context.stop()
  }

  test("kryoSerializer test") {
    // Drop the cached RDD so it is rebuilt under the Kryo-enabled context below.
    labeledPointsRDD = null
    val conf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.registerKryoClasses(Array(classOf[Booster]))
    val context = new SparkContext(conf)
    context.setLogLevel("ERROR")
    val trainingSet = buildTrainingRDD(context)
    val testInstances =
      loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
    import DataUtils._
    val testMatrix = new DMatrix(new JDMatrix(testInstances, null))
    val params = Map(
      "eta" -> "1", "max_depth" -> "2", "silent" -> "1",
      "objective" -> "binary:logistic")
    val model = XGBoost.trainWithRDD(trainingSet, params, 5, numWorkers)
    // Error rate on the held-out set should stay below 0.1 even with Kryo on.
    val metric = new EvalError()
    assert(metric.eval(model.booster.predict(testMatrix, outPutMargin = true),
      testMatrix) < 0.1)
    context.stop()
  }
}

View File

@@ -25,77 +25,27 @@ import scala.io.Source
import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.VectorUDT
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.sql._
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}
class XGBoostDFSuite extends SharedSparkContext with Utils {
private def loadRow(filePath: String): List[Row] = {
val file = Source.fromFile(new File(filePath))
val rowList = new ListBuffer[Row]
for (rowLine <- file.getLines()) {
rowList += fromSVMStringToRow(rowLine)
private var trainingDF: DataFrame = null
private def buildTrainingDataframe(sparkContext: Option[SparkContext] = None): DataFrame = {
if (trainingDF == null) {
val rowList = loadLabelPoints(getClass.getResource("/agaricus.txt.train").getFile)
val labeledPointsRDD = sparkContext.getOrElse(sc).parallelize(rowList, numWorkers)
val sparkSession = SparkSession.builder().appName("XGBoostDFSuite").getOrCreate()
import sparkSession.implicits._
trainingDF = sparkSession.createDataset(labeledPointsRDD).toDF
}
rowList.toList
trainingDF
}
private def buildTrainingDataframe(sparkContext: Option[SparkContext] = None):
DataFrame = {
val rowList = loadRow(getClass.getResource("/agaricus.txt.train").getFile)
val rowRDD = sparkContext.getOrElse(sc).parallelize(rowList, numWorkers)
val sparkSession = SparkSession.builder().appName("XGBoostDFSuite").getOrCreate()
sparkSession.createDataFrame(rowRDD,
StructType(Array(StructField("label", DoubleType, nullable = false),
StructField("features", new VectorUDT, nullable = false))))
}
private def fromSVMStringToRow(line: String): Row = {
val (label, sv) = fromSVMStringToLabelAndVector(line)
Row(label, sv)
}
test("test consistency between training with dataframe and RDD") {
val trainingDF = buildTrainingDataframe()
val trainingRDD = buildTrainingRDD(sc)
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = false)
val xgBoostModelWithRDD = XGBoost.trainWithRDD(trainingRDD, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = false)
val eval = new EvalError()
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
assert(
eval.eval(xgBoostModelWithDF.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix) ===
eval.eval(xgBoostModelWithRDD.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix))
}
test("test transform of dataframe-based model") {
val trainingDF = buildTrainingDataframe()
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = false)
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile)
val testRowsRDD = sc.parallelize(testSet.zipWithIndex, numWorkers).map{
case (instance: LabeledPoint, id: Int) =>
Row(id, instance.features, instance.label)
}
val testDF = trainingDF.sparkSession.createDataFrame(testRowsRDD, StructType(
Array(StructField("id", IntegerType),
StructField("features", new VectorUDT), StructField("label", DoubleType))))
xgBoostModelWithDF.transform(testDF).show()
}
test("test order preservation of dataframe-based model") {
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
"objective" -> "binary:logistic").toMap
test("test consistency and order preservation of dataframe-based model") {
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic")
val trainingItr = loadLabelPoints(getClass.getResource("/agaricus.txt.train").getFile).
iterator
val (testItr, auxTestItr) =
@@ -105,25 +55,109 @@ class XGBoostDFSuite extends SharedSparkContext with Utils {
val testDMatrix = new DMatrix(new JDMatrix(testItr, null))
val xgboostModel = ScalaXGBoost.train(trainDMatrix, paramMap, 5)
val predResultFromSeq = xgboostModel.predict(testDMatrix)
val testRowsRDD = sc.parallelize(
auxTestItr.toList.zipWithIndex, numWorkers).map {
val testSetItr = auxTestItr.zipWithIndex.map {
case (instance: LabeledPoint, id: Int) =>
Row(id, instance.features, instance.label)
(id, instance.features, instance.label)
}
val trainingDF = buildTrainingDataframe()
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = false)
val testDF = trainingDF.sqlContext.createDataFrame(testRowsRDD, StructType(
Array(StructField("id", IntegerType), StructField("features", new VectorUDT),
StructField("label", DoubleType))))
val predResultsFromDF =
xgBoostModelWithDF.transform(testDF).collect().map(row => (row.getAs[Int]("id"),
row.getAs[mutable.WrappedArray[Float]]("prediction"))).toMap
val testDF = trainingDF.sparkSession.createDataFrame(testSetItr.toList).toDF(
"id", "features", "label")
val predResultsFromDF = xgBoostModelWithDF.setExternalMemory(true).transform(testDF).
collect().map(row =>
(row.getAs[Int]("id"), row.getAs[mutable.WrappedArray[Float]]("probabilities"))
).toMap
assert(testDF.count() === predResultsFromDF.size)
for (i <- predResultFromSeq.indices) {
assert(predResultFromSeq(i).length === predResultsFromDF(i).length)
for (j <- predResultFromSeq(i).indices) {
assert(predResultFromSeq(i)(j) === predResultsFromDF(i)(j))
}
}
cleanExternalCache("XGBoostDFSuite")
}
test("test transformLeaf") {
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic")
val testItr = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
val trainingDF = buildTrainingDataframe()
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = false)
val testSetItr = testItr.zipWithIndex.map {
case (instance: LabeledPoint, id: Int) =>
(id, instance.features, instance.label)
}
val testDF = trainingDF.sparkSession.createDataFrame(testSetItr.toList).toDF(
"id", "features", "label")
xgBoostModelWithDF.transformLeaf(testDF).show()
}
test("test schema of XGBoostRegressionModel") {
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "reg:linear")
val testItr = loadLabelPoints(getClass.getResource("/machine.txt.test").getFile).iterator.
zipWithIndex.map { case (instance: LabeledPoint, id: Int) =>
(id, instance.features, instance.label)
}
val trainingDF = {
val rowList = loadLabelPoints(getClass.getResource("/machine.txt.train").getFile)
val labeledPointsRDD = sc.parallelize(rowList, numWorkers)
val sparkSession = SparkSession.builder().appName("XGBoostDFSuite").getOrCreate()
import sparkSession.implicits._
sparkSession.createDataset(labeledPointsRDD).toDF
}
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = true)
xgBoostModelWithDF.setPredictionCol("final_prediction")
val testDF = trainingDF.sparkSession.createDataFrame(testItr.toList).toDF(
"id", "features", "label")
val predictionDF = xgBoostModelWithDF.setExternalMemory(true).transform(testDF)
assert(predictionDF.columns.contains("id") === true)
assert(predictionDF.columns.contains("features") === true)
assert(predictionDF.columns.contains("label") === true)
assert(predictionDF.columns.contains("final_prediction") === true)
predictionDF.show()
cleanExternalCache("XGBoostDFSuite")
}
test("test schema of XGBoostClassificationModel") {
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic")
val testItr = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator.
zipWithIndex.map { case (instance: LabeledPoint, id: Int) =>
(id, instance.features, instance.label)
}
val trainingDF = buildTrainingDataframe()
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = true)
xgBoostModelWithDF.asInstanceOf[XGBoostClassificationModel].setRawPredictionCol(
"raw_prediction").setPredictionCol("final_prediction")
val testDF = trainingDF.sparkSession.createDataFrame(testItr.toList).toDF(
"id", "features", "label")
var predictionDF = xgBoostModelWithDF.setExternalMemory(true).transform(testDF)
assert(predictionDF.columns.contains("id") === true)
assert(predictionDF.columns.contains("features") === true)
assert(predictionDF.columns.contains("label") === true)
assert(predictionDF.columns.contains("raw_prediction") === true)
assert(predictionDF.columns.contains("final_prediction") === true)
xgBoostModelWithDF.asInstanceOf[XGBoostClassificationModel].setRawPredictionCol("").
setPredictionCol("final_prediction")
predictionDF = xgBoostModelWithDF.transform(testDF)
assert(predictionDF.columns.contains("id") === true)
assert(predictionDF.columns.contains("features") === true)
assert(predictionDF.columns.contains("label") === true)
assert(predictionDF.columns.contains("raw_prediction") === false)
assert(predictionDF.columns.contains("final_prediction") === true)
xgBoostModelWithDF.asInstanceOf[XGBoostClassificationModel].
setRawPredictionCol("raw_prediction").setPredictionCol("")
predictionDF = xgBoostModelWithDF.transform(testDF)
assert(predictionDF.columns.contains("id") === true)
assert(predictionDF.columns.contains("features") === true)
assert(predictionDF.columns.contains("label") === true)
assert(predictionDF.columns.contains("raw_prediction") === true)
assert(predictionDF.columns.contains("final_prediction") === false)
cleanExternalCache("XGBoostDFSuite")
}
}

View File

@@ -16,66 +16,47 @@
package ml.dmlc.xgboost4j.scala.spark
import java.io.File
import java.nio.file.Files
import scala.collection.mutable.ListBuffer
import scala.util.Random
import ml.dmlc.xgboost4j.java.{Booster => JBooster, DMatrix => JDMatrix}
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, XGBoost => ScalaXGBoost}
import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.{Vector => SparkVector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
class XGBoostGeneralSuite extends SharedSparkContext with Utils {
test("build RDD containing boosters with the specified worker number") {
val trainingRDD = buildTrainingRDD(sc)
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
val boosterRDD = XGBoost.buildDistributedBoosters(
trainingRDD,
List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic").toMap,
new scala.collection.mutable.HashMap[String, String],
numWorkers = 2, round = 5, eval = null, obj = null, useExternalMemory = false)
numWorkers = 2, round = 5, eval = null, obj = null, useExternalMemory = true)
val boosterCount = boosterRDD.count()
assert(boosterCount === 2)
val boosters = boosterRDD.collect()
val eval = new EvalError()
for (booster <- boosters) {
// the threshold is 0.11 because it does not sync boosters with AllReduce
val predicts = booster.predict(testSetDMatrix, outPutMargin = true)
assert(eval.eval(predicts, testSetDMatrix) < 0.11)
}
cleanExternalCache("XGBoostSuite")
}
test("training with external memory cache") {
sc.stop()
sc = null
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
val customSparkContext = new SparkContext(sparkConf)
customSparkContext.setLogLevel("ERROR")
val eval = new EvalError()
val trainingRDD = buildTrainingRDD(customSparkContext)
val trainingRDD = buildTrainingRDD(sc)
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
nWorkers = numWorkers, useExternalMemory = true)
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix) < 0.1)
customSparkContext.stop()
// clean
val dir = new File(".")
for (file <- dir.listFiles() if file.getName.startsWith("XGBoostSuite-0-dtrain_cache")) {
file.delete()
}
cleanExternalCache("XGBoostSuite")
}
test("test with dense vectors containing missing value") {
@@ -106,10 +87,13 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
}
val trainingRDD = buildDenseRDD().repartition(4)
val testRDD = buildDenseRDD().repartition(4)
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers,
useExternalMemory = true)
xgBoostModel.predict(testRDD.map(_.features.toDense), missingValue = -0.1f).collect()
// clean
cleanExternalCache("XGBoostSuite")
}
test("test consistency of prediction functions with RDD") {
@@ -120,11 +104,12 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
for (i <- testSet.indices) {
assert(testCollection(i).toDense.values.sameElements(testSet(i).features.toDense.values))
}
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
"objective" -> "binary:logistic")
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
val predRDD = xgBoostModel.predict(testRDD)
val predResult1 = predRDD.collect()(0)
assert(testRDD.count() === predResult1.length)
import DataUtils._
val predResult2 = xgBoostModel.booster.predict(new DMatrix(testSet.iterator))
for (i <- predResult1.indices; j <- predResult1(i).indices) {
@@ -134,9 +119,9 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
test("test eval functions with RDD") {
val trainingRDD = buildTrainingRDD(sc).cache()
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
"objective" -> "binary:logistic")
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5, nWorkers = numWorkers)
xgBoostModel.eval(trainingRDD, "eval1", iter = 5, useExternalCache = false)
xgBoostModel.eval(trainingRDD, "eval2", evalFunc = new EvalError, useExternalCache = false)
}
@@ -150,7 +135,7 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
val testRDD = buildEmptyRDD()
val tempDir = Files.createTempDirectory("xgboosttest-")
val tempFile = Files.createTempFile(tempDir, "", "")
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
println(xgBoostModel.predict(testRDD).collect().length === 0)
@@ -164,8 +149,8 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
val tempDir = Files.createTempDirectory("xgboosttest-")
val tempFile = Files.createTempFile(tempDir, "", "")
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
"objective" -> "binary:logistic")
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
val evalResults = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix)
@@ -177,41 +162,40 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
assert(loadedEvalResults == evalResults)
}
test("nthread configuration must be equal to spark.task.cpus") {
sc.stop()
sc = null
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite").
set("spark.task.cpus", "4")
val customSparkContext = new SparkContext(sparkConf)
customSparkContext.setLogLevel("ERROR")
// start another app
val trainingRDD = buildTrainingRDD(customSparkContext)
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic", "nthread" -> 6).toMap
intercept[IllegalArgumentException] {
XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
}
customSparkContext.stop()
}
test("kryoSerializer test") {
sc.stop()
sc = null
val eval = new EvalError()
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sparkConf.registerKryoClasses(Array(classOf[Booster]))
val customSparkContext = new SparkContext(sparkConf)
customSparkContext.setLogLevel("ERROR")
val trainingRDD = buildTrainingRDD(customSparkContext)
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix) < 0.1)
customSparkContext.stop()
test("test save and load of different types of models") {
val tempDir = Files.createTempDirectory("xgboosttest-")
val tempFile = Files.createTempFile(tempDir, "", "")
val trainingRDD = buildTrainingRDD(sc)
var paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "reg:linear")
// validate regression model
var xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
nWorkers = numWorkers, useExternalMemory = false)
xgBoostModel.setFeaturesCol("feature_col")
xgBoostModel.setLabelCol("label_col")
xgBoostModel.setPredictionCol("prediction_col")
xgBoostModel.saveModelAsHadoopFile(tempFile.toFile.getAbsolutePath)
var loadedXGBoostModel = XGBoost.loadModelFromHadoopFile(tempFile.toFile.getAbsolutePath)
assert(loadedXGBoostModel.isInstanceOf[XGBoostRegressionModel])
assert(loadedXGBoostModel.getFeaturesCol == "feature_col")
assert(loadedXGBoostModel.getLabelCol == "label_col")
assert(loadedXGBoostModel.getPredictionCol == "prediction_col")
// classification model
paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic")
xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
nWorkers = numWorkers, useExternalMemory = false)
xgBoostModel.asInstanceOf[XGBoostClassificationModel].setRawPredictionCol("raw_col")
xgBoostModel.asInstanceOf[XGBoostClassificationModel].setThresholds(Array(0.5, 0.5))
xgBoostModel.saveModelAsHadoopFile(tempFile.toFile.getAbsolutePath)
loadedXGBoostModel = XGBoost.loadModelFromHadoopFile(tempFile.toFile.getAbsolutePath)
assert(loadedXGBoostModel.isInstanceOf[XGBoostClassificationModel])
assert(loadedXGBoostModel.asInstanceOf[XGBoostClassificationModel].getRawPredictionCol ==
"raw_col")
assert(loadedXGBoostModel.asInstanceOf[XGBoostClassificationModel].getThresholds.deep ==
Array(0.5, 0.5).deep)
assert(loadedXGBoostModel.getFeaturesCol == "features")
assert(loadedXGBoostModel.getLabelCol == "label")
assert(loadedXGBoostModel.getPredictionCol == "prediction")
}
}