[jvm-packages] Integration with Spark Dataframe/Dataset (#1559)
* bump up to scala 2.11
* framework of data frame integration
* test consistency between RDD and DataFrame
* order preservation
* test order preservation
* example code and fix makefile
* improve type checking
* improve APIs
* user docs
* work around travis CI's limitation on log length
* adjust test structure
* integrate with Spark 1.x
* spark 2.x integration
* remove spark 1.x implementation but provide instructions on how to downgrade
parent 7ff742ebf7
commit fb02797e2a

.gitignore (vendored)
@@ -79,3 +79,5 @@ tags
*.class
target
*.swp
+
+.DS_Store
@@ -13,7 +13,7 @@ Before you install XGBoost4J, you need to define environment variable `JAVA_HOME`

After your `JAVA_HOME` is defined correctly, it is as simple as running `mvn package` under the jvm-packages directory to install XGBoost4J. You can also skip the tests by running `mvn -DskipTests=true package` if you are sure about the correctness of your local setup.

-XGBoost4J-Spark, which integrates XGBoost with Spark, requires Spark 1.6 or newer (the default version is 1.6.1). You can build XGBoost4J-Spark as a component of XGBoost4J by running `mvn package`, or specify the Spark version with `mvn -Dspark.version=1.6.0 package`.
+After integrating with the Dataframe/Dataset APIs of Spark 2.0, XGBoost4J-Spark can only be compiled against Spark 2.x. You can build XGBoost4J-Spark as a component of XGBoost4J by running `mvn package`, and you can specify the Spark version with `mvn -Dspark.version=2.0.0 package`. (To continue working with Spark 1.x, users should update pom.xml by modifying properties such as `spark.version`, `scala.version`, and `scala.binary.version`. Users also need to change the implementation by replacing SparkSession with SQLContext and changing the type of API parameters from Dataset[_] to DataFrame.)

Contents
--------
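For reference, a minimal sketch of the downgrade described above, in the top-level jvm-packages pom.xml (path assumed from the build instructions; the 1.x property values shown are the ones this commit removes, so treat them as illustrative):

```xml
<!-- Sketch only: properties to restore for a Spark 1.x build -->
<properties>
  <spark.version>1.6.1</spark.version>
  <scala.version>2.10.5</scala.version>
  <scala.binary.version>2.10</scala.binary.version>
</properties>
```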
@@ -49,12 +49,17 @@ object XGBoostScalaExample {
```

### XGBoost Spark

XGBoost4J-Spark supports training XGBoost models through both RDD and DataFrame.

RDD Version:

```scala
import org.apache.spark.SparkContext
import org.apache.spark.mllib.util.MLUtils
import ml.dmlc.xgboost4j.scala.spark.XGBoost

-object DistTrainWithSpark {
+object SparkWithRDD {
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      println(
@@ -85,6 +90,52 @@ object DistTrainWithSpark {
  }
```

Dataframe Version:

```scala
object SparkWithDataFrame {
  def main(args: Array[String]): Unit = {
    if (args.length != 5) {
      println(
        "usage: program num_of_rounds num_workers training_path test_path model_path")
      sys.exit(1)
    }
    // create SparkSession
    val sparkConf = new SparkConf().setAppName("XGBoost-spark-example")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sparkConf.registerKryoClasses(Array(classOf[Booster]))
    val sparkSession = SparkSession.builder().appName("XGBoost-spark-example").config(sparkConf).
      getOrCreate()
    // create training and testing dataframes
    val inputTrainPath = args(2)
    val inputTestPath = args(3)
    val outputModelPath = args(4)
    // number of iterations
    val numRound = args(0).toInt
    import DataUtils._
    val trainRDDOfRows = MLUtils.loadLibSVMFile(sparkSession.sparkContext, inputTrainPath).
      map { labeledPoint => Row(labeledPoint.features, labeledPoint.label) }
    val trainDF = sparkSession.createDataFrame(trainRDDOfRows, StructType(
      Array(StructField("features", ArrayType(FloatType)), StructField("label", IntegerType))))
    val testRDDOfRows = MLUtils.loadLibSVMFile(sparkSession.sparkContext, inputTestPath).
      zipWithIndex().map { case (labeledPoint, id) =>
        Row(id, labeledPoint.features, labeledPoint.label) }
    val testDF = sparkSession.createDataFrame(testRDDOfRows, StructType(
      Array(StructField("id", LongType),
        StructField("features", ArrayType(FloatType)), StructField("label", IntegerType))))
    // training parameters
    val paramMap = List(
      "eta" -> 0.1f,
      "max_depth" -> 2,
      "objective" -> "binary:logistic").toMap
    val xgboostModel = XGBoost.trainWithDataFrame(
      trainDF, paramMap, numRound, nWorkers = args(1).toInt, useExternalMemory = true)
    // xgboost-spark appends the column containing prediction results
    xgboostModel.transform(testDF).show()
  }
}
```

### XGBoost Flink
```scala
import ml.dmlc.xgboost4j.scala.flink.XGBoost
```
@@ -14,8 +14,6 @@
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
    <maven.version>3.3.9</maven.version>
-   <scala.version>2.10.5</scala.version>
-   <scala.binary.version>2.10</scala.binary.version>
  </properties>
  <modules>
    <module>xgboost4j</module>
@@ -25,13 +23,15 @@
  </modules>
  <profiles>
    <profile>
-     <id>spark-1.x</id>
+     <id>spark-2.x</id>
      <activation>
        <activeByDefault>true</activeByDefault>
      </activation>
      <properties>
-       <spark.version>1.6.1</spark.version>
-       <scala.binary.version>2.10</scala.binary.version>
+       <spark.version>2.0.0</spark.version>
+       <flink.suffix>_2.11</flink.suffix>
+       <scala.version>2.11.8</scala.version>
+       <scala.binary.version>2.11</scala.binary.version>
      </properties>
    </profile>
  </profiles>
@@ -0,0 +1,65 @@
/*
 Copyright (c) 2014 by Contributors

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */

package ml.dmlc.xgboost4j.scala.example.spark

import ml.dmlc.xgboost4j.scala.Booster
import ml.dmlc.xgboost4j.scala.spark.{XGBoost, DataUtils}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types._
import org.apache.spark.sql.{SQLContext, Row}
import org.apache.spark.{SparkContext, SparkConf}

object SparkWithDataFrame {
  def main(args: Array[String]): Unit = {
    if (args.length != 5) {
      println(
        "usage: program num_of_rounds num_workers training_path test_path model_path")
      sys.exit(1)
    }
    // create SparkContext and SQLContext
    val sparkConf = new SparkConf().setAppName("XGBoost-spark-example")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sparkConf.registerKryoClasses(Array(classOf[Booster]))
    val sqlContext = new SQLContext(new SparkContext(sparkConf))
    // create training and testing dataframes
    val inputTrainPath = args(2)
    val inputTestPath = args(3)
    val outputModelPath = args(4)
    // number of iterations
    val numRound = args(0).toInt
    import DataUtils._
    val trainRDDOfRows = MLUtils.loadLibSVMFile(sqlContext.sparkContext, inputTrainPath).
      map { labeledPoint => Row(labeledPoint.features, labeledPoint.label) }
    val trainDF = sqlContext.createDataFrame(trainRDDOfRows, StructType(
      Array(StructField("features", ArrayType(FloatType)), StructField("label", IntegerType))))
    val testRDDOfRows = MLUtils.loadLibSVMFile(sqlContext.sparkContext, inputTestPath).
      zipWithIndex().map { case (labeledPoint, id) =>
        Row(id, labeledPoint.features, labeledPoint.label) }
    val testDF = sqlContext.createDataFrame(testRDDOfRows, StructType(
      Array(StructField("id", LongType),
        StructField("features", ArrayType(FloatType)), StructField("label", IntegerType))))
    // training parameters
    val paramMap = List(
      "eta" -> 0.1f,
      "max_depth" -> 2,
      "objective" -> "binary:logistic").toMap
    val xgboostModel = XGBoost.trainWithDataFrame(
      trainDF, paramMap, numRound, nWorkers = args(1).toInt, useExternalMemory = true)
    // xgboost-spark appends the column containing prediction results
    xgboostModel.transform(testDF).show()
  }
}
@@ -21,7 +21,7 @@ import ml.dmlc.xgboost4j.scala.spark.{DataUtils, XGBoost}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils

-object DistTrainWithSpark {
+object SparkWithRDD {
  def main(args: Array[String]): Unit = {
    if (args.length != 5) {
      println(
@@ -45,7 +45,7 @@ object DistTrainWithSpark {
      "eta" -> 0.1f,
      "max_depth" -> 2,
      "objective" -> "binary:logistic").toMap
-    val xgboostModel = XGBoost.train(trainRDD, paramMap, numRound, nWorkers = args(1).toInt,
+    val xgboostModel = XGBoost.trainWithRDD(trainRDD, paramMap, numRound, nWorkers = args(1).toInt,
      useExternalMemory = true)
    xgboostModel.booster.predict(new DMatrix(testSet))
    // save model to HDFS path
@@ -35,22 +35,17 @@
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
-     <artifactId>flink-java</artifactId>
+     <artifactId>flink-scala${flink.suffix}</artifactId>
      <version>0.10.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
-     <artifactId>flink-scala</artifactId>
+     <artifactId>flink-clients${flink.suffix}</artifactId>
      <version>0.10.2</version>
    </dependency>
-   <dependency>
-     <groupId>org.apache.flink</groupId>
-     <artifactId>flink-clients</artifactId>
-     <version>0.10.2</version>
-   </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
-     <artifactId>flink-ml</artifactId>
+     <artifactId>flink-ml${flink.suffix}</artifactId>
      <version>0.10.2</version>
    </dependency>
  </dependencies>
@@ -18,10 +18,9 @@ package ml.dmlc.xgboost4j.scala.spark

import scala.collection.JavaConverters._

-import org.apache.spark.mllib.linalg.{SparseVector, DenseVector, Vector}
-import org.apache.spark.mllib.regression.{LabeledPoint => SparkLabeledPoint}
-
import ml.dmlc.xgboost4j.LabeledPoint
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector}
+import org.apache.spark.mllib.regression.{LabeledPoint => SparkLabeledPoint}

object DataUtils extends Serializable {
  implicit def fromSparkPointsToXGBoostPointsJava(sps: Iterator[SparkLabeledPoint])
@@ -27,6 +27,7 @@ import org.apache.hadoop.fs.Path
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.{SparkContext, TaskContext}

object XGBoost extends Serializable {
@@ -111,6 +112,33 @@ object XGBoost extends Serializable {
    }.cache()
  }

  /**
   *
   * @param trainingData the training set represented as a DataFrame
   * @param params Map containing the parameters to configure XGBoost
   * @param round the number of iterations
   * @param nWorkers the number of xgboost workers, 0 by default which means that the number of
   *                 workers equals the partition number of the trainingData RDD
   * @param obj the user-defined objective function, null by default
   * @param eval the user-defined evaluation function, null by default
   * @param useExternalMemory indicates whether to use external memory cache; by setting this flag
   *                 to true, the user may save RAM when running XGBoost within Spark
   * @param missing the value representing the missing value in the dataset
   * @param inputCol the name of the input column, "features" by default
   * @param labelCol the name of the label column, "label" by default
   * @throws ml.dmlc.xgboost4j.java.XGBoostError when the model training fails
   * @return XGBoostModel when training succeeds
   */
  @throws(classOf[XGBoostError])
  def trainWithDataFrame(trainingData: Dataset[_],
      params: Map[String, Any], round: Int,
      nWorkers: Int, obj: ObjectiveTrait = null, eval: EvalTrait = null,
      useExternalMemory: Boolean = false, missing: Float = Float.NaN,
      inputCol: String = "features", labelCol: String = "label"): XGBoostModel = {
    new XGBoostEstimator(inputCol, labelCol, params, round, nWorkers, obj, eval,
      useExternalMemory, missing).fit(trainingData)
  }

  /**
   *
   * @param trainingData the training set represented as an RDD
@@ -127,9 +155,9 @@ object XGBoost extends Serializable {
   * @return XGBoostModel when training succeeds
   */
  @throws(classOf[XGBoostError])
-  def train(trainingData: RDD[LabeledPoint], configMap: Map[String, Any], round: Int,
-      nWorkers: Int, obj: ObjectiveTrait = null, eval: EvalTrait = null,
-      useExternalMemory: Boolean = false, missing: Float = Float.NaN): XGBoostModel = {
+  def trainWithRDD(trainingData: RDD[LabeledPoint], configMap: Map[String, Any], round: Int,
+      nWorkers: Int, obj: ObjectiveTrait = null, eval: EvalTrait = null,
+      useExternalMemory: Boolean = false, missing: Float = Float.NaN): XGBoostModel = {
    require(nWorkers > 0, "you must specify more than 0 workers")
    val tracker = new RabitTracker(nWorkers)
    implicit val sc = trainingData.sparkContext
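For orientation, a minimal usage sketch of the new DataFrame entry point above (paths, parameter values, and worker counts are illustrative; the default "features"/"label" column names come from the `trainWithDataFrame` signature):

```scala
// Hypothetical driver code; trainDF/testDF are DataFrames with "features" and "label" columns.
val paramMap = Map("eta" -> 0.1, "max_depth" -> 2, "objective" -> "binary:logistic")
val model = XGBoost.trainWithDataFrame(trainDF, paramMap, round = 5, nWorkers = 4)
// transform() appends a "prediction" column to the input dataframe
model.transform(testDF).show()
```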
@@ -0,0 +1,81 @@
/*
 Copyright (c) 2014 by Contributors

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */

package ml.dmlc.xgboost4j.scala.spark

import ml.dmlc.xgboost4j.scala.{EvalTrait, ObjectiveTrait}
import org.apache.spark.ml.{Predictor, Estimator}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.linalg.{VectorUDT, Vector}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{NumericType, DoubleType, StructType}
import org.apache.spark.sql.{DataFrame, TypedColumn, Dataset, Row}

/**
 * the estimator wrapping XGBoost to produce a training model
 *
 * @param inputCol the name of the input column
 * @param labelCol the name of the label column
 * @param xgboostParams the parameters configuring XGBoost
 * @param round the number of iterations to train
 * @param nWorkers the total number of xgboost workers
 * @param obj the customized objective function, defaults to null (use the model's default)
 * @param eval the customized eval function, defaults to null (use the model's default)
 * @param useExternalMemory whether to use external memory when training
 * @param missing the value taken as missing
 */
class XGBoostEstimator(
    inputCol: String, labelCol: String,
    xgboostParams: Map[String, Any], round: Int, nWorkers: Int,
    obj: ObjectiveTrait = null,
    eval: EvalTrait = null, useExternalMemory: Boolean = false, missing: Float = Float.NaN)
  extends Estimator[XGBoostModel] {

  override val uid: String = Identifiable.randomUID("XGBoostEstimator")

  /**
   * produce an XGBoostModel by fitting the given dataset
   */
  def fit(trainingSet: Dataset[_]): XGBoostModel = {
    val instances = trainingSet.select(
      col(inputCol), col(labelCol).cast(DoubleType)).rdd.map {
      case Row(feature: Vector, label: Double) =>
        LabeledPoint(label, feature)
    }
    transformSchema(trainingSet.schema, logging = true)
    val trainedModel = XGBoost.trainWithRDD(instances, xgboostParams, round, nWorkers, obj,
      eval, useExternalMemory, missing).setParent(this)
    copyValues(trainedModel)
  }

  override def copy(extra: ParamMap): Estimator[XGBoostModel] = {
    defaultCopy(extra)
  }

  override def transformSchema(schema: StructType): StructType = {
    // check input type; for now we only support VectorUDT as the input feature type
    val inputType = schema(inputCol).dataType
    require(inputType.equals(new VectorUDT),
      s"the type of input column $inputCol has to be VectorUDT")
    // check label type
    val labelType = schema(labelCol).dataType
    require(labelType.isInstanceOf[NumericType], s"the type of label column $labelCol has to" +
      s" be NumericType")
    schema
  }
}
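Since XGBoostEstimator extends `org.apache.spark.ml.Estimator`, it should also compose with Spark ML pipelines. A sketch of that assumption (trainDF is an illustrative DataFrame with "features" and "label" columns):

```scala
import org.apache.spark.ml.{Pipeline, PipelineStage}

// Hypothetical pipeline usage of the estimator defined above.
val estimator = new XGBoostEstimator("features", "label",
  Map("objective" -> "binary:logistic"), round = 5, nWorkers = 4)
val pipelineModel = new Pipeline().setStages(Array[PipelineStage](estimator)).fit(trainDF)
```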
@@ -16,16 +16,28 @@

package ml.dmlc.xgboost4j.scala.spark

-import org.apache.hadoop.fs.{Path, FileSystem}
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.{TaskContext, SparkContext}
-import org.apache.spark.mllib.linalg.{DenseVector, Vector}
-import org.apache.spark.rdd.RDD
-import ml.dmlc.xgboost4j.java.{Rabit, DMatrix => JDMatrix}
-import ml.dmlc.xgboost4j.scala.{EvalTrait, Booster, DMatrix}
-import scala.collection.JavaConverters._
-
-class XGBoostModel(_booster: Booster) extends Serializable {
+import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix, Rabit}
+import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, EvalTrait}
+import org.apache.hadoop.fs.Path
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.ml.{Model, PredictionModel}
+import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.mllib.linalg.{VectorUDT, DenseVector, Vector}
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.{DataFrame, Dataset, Row}
+import org.apache.spark.{SparkContext, TaskContext}
+
+class XGBoostModel(_booster: Booster) extends Model[XGBoostModel] with Serializable {
+
+  var inputCol = "features"
+  var outputCol = "prediction"
+  var outputType: DataType = ArrayType(elementType = FloatType, containsNull = false)

  /**
   * evaluate XGBoostModel with an RDD-wrapped dataset
@@ -40,6 +52,7 @@ class XGBoostModel(_booster: Booster) extends Serializable {
      eval: EvalTrait,
      evalName: String,
      useExternalCache: Boolean = false): String = {
+    val broadcastBooster = evalDataset.sparkContext.broadcast(_booster)
    val appName = evalDataset.context.appName
    val allEvalMetrics = evalDataset.mapPartitions {
      labeledPointsPartition =>
@@ -55,7 +68,7 @@ class XGBoostModel(_booster: Booster) extends Serializable {
          }
        }
        val dMatrix = new DMatrix(labeledPointsPartition, cacheFileName)
-       val predictions = _booster.predict(dMatrix)
+       val predictions = broadcastBooster.value.predict(dMatrix)
        Rabit.shutdown()
        Iterator(Some(eval.eval(predictions, dMatrix)))
      } else {
@@ -152,8 +165,71 @@ class XGBoostModel(_booster: Booster) extends Serializable {
    outputStream.close()
  }

  /**
   * Get the booster instance of this model
   */
  def booster: Booster = _booster

  override val uid: String = Identifiable.randomUID("XGBoostModel")

  override def copy(extra: ParamMap): XGBoostModel = {
    defaultCopy(extra)
  }

  /**
   * produces the prediction results and appends them as an additional column in the original
   * dataset
   * NOTE: the prediction results are kept in the original xgboost output format
   * @return the original dataframe with an additional column containing prediction results
   */
  override def transform(testSet: Dataset[_]): DataFrame = {
    transform(testSet, None)
  }

  /**
   * produces the prediction results and appends them as an additional column in the original
   * dataset
   * NOTE: the prediction results are transformed by applying the transformation function
   * predictResultTrans to the original xgboost output
   * @param predictResultTrans the function to transform xgboost output to the expected format
   * @return the original dataframe with an additional column containing prediction results
   */
  def transform(testSet: Dataset[_], predictResultTrans: Option[Array[Float] => DataType]):
      DataFrame = {
    transformSchema(testSet.schema, logging = true)
    val broadcastBooster = testSet.sqlContext.sparkContext.broadcast(_booster)
    val instances = testSet.rdd.mapPartitions {
      rowIterator =>
        if (rowIterator.hasNext) {
          val (rowItr1, rowItr2) = rowIterator.duplicate
          val vectorIterator = rowItr2.map(row => row.asInstanceOf[Row].getAs[Vector](inputCol)).
            toList.iterator
          import DataUtils._
          val testDataset = new DMatrix(vectorIterator, null)
          val rowPredictResults = broadcastBooster.value.predict(testDataset)
          val predictResults = {
            if (predictResultTrans.isDefined) {
              rowPredictResults.map(prediction => Row(predictResultTrans.get(prediction))).iterator
            } else {
              rowPredictResults.map(prediction => Row(prediction)).iterator
            }
          }
          rowItr1.zip(predictResults).map {
            case (originalColumns: Row, predictColumn: Row) =>
              Row.fromSeq(originalColumns.toSeq ++ predictColumn.toSeq)
          }
        } else {
          Iterator[Row]()
        }
    }
    testSet.sqlContext.createDataFrame(instances, testSet.schema.add("prediction", outputType)).
      cache()
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if (schema.fieldNames.contains(outputCol)) {
      throw new IllegalArgumentException(s"Output column $outputCol already exists.")
    }
    val inputType = schema(inputCol).dataType
    require(inputType.equals(new VectorUDT),
      s"the type of input column $inputCol has to be VectorUDT")
    val outputFields = schema.fields :+ StructField(outputCol, outputType, nullable = false)
    StructType(outputFields)
  }
}
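The appended "prediction" column holds the raw xgboost output as an array of floats, so callers can read it back per row. A sketch of that pattern, mirroring the test suite below ("id" is an illustrative column):

```scala
import scala.collection.mutable

// Collect (id, prediction) pairs from the transformed dataframe.
val predicted = xgboostModel.transform(testDF).collect().map { row =>
  (row.getAs[Int]("id"), row.getAs[mutable.WrappedArray[Float]]("prediction"))
}
```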
@@ -0,0 +1,38 @@
/*
 Copyright (c) 2014 by Contributors

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */

package ml.dmlc.xgboost4j.scala.spark

import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfter, FunSuite}

trait SharedSparkContext extends FunSuite with BeforeAndAfter {

  protected implicit var sc: SparkContext = null

  before {
    // build SparkContext
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
    sc = new SparkContext(sparkConf)
    sc.setLogLevel("ERROR")
  }

  after {
    if (sc != null) {
      sc.stop()
    }
  }
}
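Any suite mixing in this trait gets a fresh local SparkContext before each test and a clean shutdown after it. An illustrative (hypothetical) suite:

```scala
class WordCountSuite extends SharedSparkContext {
  test("sc is available inside tests") {
    assert(sc.parallelize(1 to 10).count() === 10)
  }
}
```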
@@ -0,0 +1,107 @@
/*
 Copyright (c) 2014 by Contributors

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */

package ml.dmlc.xgboost4j.scala.spark

import java.io.File

import scala.collection.mutable.ListBuffer
import scala.io.Source

import ml.dmlc.xgboost4j.java.XGBoostError
import ml.dmlc.xgboost4j.scala.{DMatrix, EvalTrait}
import org.apache.commons.logging.LogFactory
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{DenseVector, Vector => SparkVector}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

trait Utils extends SharedSparkContext {
  protected val numWorkers = Runtime.getRuntime().availableProcessors()

  protected class EvalError extends EvalTrait {

    val logger = LogFactory.getLog(classOf[EvalError])

    private[xgboost4j] var evalMetric: String = "custom_error"

    /**
     * get the evaluation metric
     *
     * @return evalMetric
     */
    override def getMetric: String = evalMetric

    /**
     * evaluate with predictions and data
     *
     * @param predicts predictions as array
     * @param dmat data matrix to evaluate
     * @return result of the metric
     */
    override def eval(predicts: Array[Array[Float]], dmat: DMatrix): Float = {
      var error: Float = 0f
      var labels: Array[Float] = null
      try {
        labels = dmat.getLabel
      } catch {
        case ex: XGBoostError =>
          logger.error(ex)
          return -1f
      }
      val nrow: Int = predicts.length
      for (i <- 0 until nrow) {
        if (labels(i) == 0.0 && predicts(i)(0) > 0) {
          error += 1
        } else if (labels(i) == 1.0 && predicts(i)(0) <= 0) {
          error += 1
        }
      }
      error / labels.length
    }
  }

  protected def loadLabelPoints(filePath: String): List[LabeledPoint] = {
    val file = Source.fromFile(new File(filePath))
    val sampleList = new ListBuffer[LabeledPoint]
    for (sample <- file.getLines()) {
      sampleList += fromSVMStringToLabeledPoint(sample)
    }
    sampleList.toList
  }

  protected def fromSVMStringToLabelAndVector(line: String): (Double, SparkVector) = {
    val labelAndFeatures = line.split(" ")
    val label = labelAndFeatures(0).toDouble
    val features = labelAndFeatures.tail
    val denseFeature = new Array[Double](129)
    for (feature <- features) {
      val idAndValue = feature.split(":")
      denseFeature(idAndValue(0).toInt) = idAndValue(1).toDouble
    }
    (label, new DenseVector(denseFeature))
  }

  protected def fromSVMStringToLabeledPoint(line: String): LabeledPoint = {
    val (label, sv) = fromSVMStringToLabelAndVector(line)
    LabeledPoint(label, sv)
  }

  protected def buildTrainingRDD(sparkContext: Option[SparkContext] = None): RDD[LabeledPoint] = {
    val sampleList = loadLabelPoints(getClass.getResource("/agaricus.txt.train").getFile)
    sparkContext.getOrElse(sc).parallelize(sampleList, numWorkers)
  }
}
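For clarity, the LibSVM-style parser above maps a line such as `"1 3:0.5 7:1.0"` to label 1.0 and a length-129 dense vector with positions 3 and 7 set (the sample line is illustrative):

```scala
// Hypothetical call against the helper defined above (from inside the Utils trait).
val (label, vec) = fromSVMStringToLabelAndVector("1 3:0.5 7:1.0")
// label == 1.0; vec(3) == 0.5; vec(7) == 1.0; all other entries are 0.0
```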
@@ -0,0 +1,129 @@
/*
 Copyright (c) 2014 by Contributors

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */

package ml.dmlc.xgboost4j.scala.spark

import java.io.File

import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.io.Source

import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.VectorUDT
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql._
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}

class XGBoostDFSuite extends Utils {

  private def loadRow(filePath: String): List[Row] = {
    val file = Source.fromFile(new File(filePath))
    val rowList = new ListBuffer[Row]
    for (rowLine <- file.getLines()) {
      rowList += fromSVMStringToRow(rowLine)
    }
    rowList.toList
  }

  private def buildTrainingDataframe(sparkContext: Option[SparkContext] = None):
      DataFrame = {
    val rowList = loadRow(getClass.getResource("/agaricus.txt.train").getFile)
    val rowRDD = sparkContext.getOrElse(sc).parallelize(rowList, numWorkers)
    val sparkSession = SparkSession.builder().appName("XGBoostDFSuite").getOrCreate()
    sparkSession.createDataFrame(rowRDD,
      StructType(Array(StructField("label", DoubleType, nullable = false),
        StructField("features", new VectorUDT, nullable = false))))
  }

  private def fromSVMStringToRow(line: String): Row = {
    val (label, sv) = fromSVMStringToLabelAndVector(line)
    Row(label, sv)
  }

  test("test consistency between training with dataframe and RDD") {
    val trainingDF = buildTrainingDataframe()
    val trainingRDD = buildTrainingRDD()
    val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
      "objective" -> "binary:logistic").toMap
    val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
      round = 5, nWorkers = numWorkers, useExternalMemory = false)
    val xgBoostModelWithRDD = XGBoost.trainWithRDD(trainingRDD, paramMap,
      round = 5, nWorkers = numWorkers, useExternalMemory = false)
    val eval = new EvalError()
    val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
    import DataUtils._
    val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
    assert(
      eval.eval(xgBoostModelWithDF.booster.predict(testSetDMatrix, outPutMargin = true),
        testSetDMatrix) ===
      eval.eval(xgBoostModelWithRDD.booster.predict(testSetDMatrix, outPutMargin = true),
        testSetDMatrix))
  }

  test("test transform of dataframe-based model") {
    val trainingDF = buildTrainingDataframe()
    val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
      "objective" -> "binary:logistic").toMap
    val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
      round = 5, nWorkers = numWorkers, useExternalMemory = false)
    val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile)
    val testRowsRDD = sc.parallelize(testSet.zipWithIndex, numWorkers).map {
      case (instance: LabeledPoint, id: Int) =>
        Row(id, instance.features, instance.label)
    }
    val testDF = trainingDF.sparkSession.createDataFrame(testRowsRDD, StructType(
      Array(StructField("id", IntegerType),
        StructField("features", new VectorUDT), StructField("label", DoubleType))))
    xgBoostModelWithDF.transform(testDF).show()
  }

  test("test order preservation of dataframe-based model") {
    val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
      "objective" -> "binary:logistic").toMap
    val trainingItr = loadLabelPoints(getClass.getResource("/agaricus.txt.train").getFile).
      iterator
    val (testItr, auxTestItr) =
      loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator.duplicate
    import DataUtils._
    val trainDMatrix = new DMatrix(new JDMatrix(trainingItr, null))
    val testDMatrix = new DMatrix(new JDMatrix(testItr, null))
    val xgboostModel = ScalaXGBoost.train(trainDMatrix, paramMap, 5)
    val predResultFromSeq = xgboostModel.predict(testDMatrix)
    val testRowsRDD = sc.parallelize(
      auxTestItr.toList.zipWithIndex, numWorkers).map {
      case (instance: LabeledPoint, id: Int) =>
        Row(id, instance.features, instance.label)
    }
    val trainingDF = buildTrainingDataframe()
    val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
      round = 5, nWorkers = numWorkers, useExternalMemory = false)
    val testDF = trainingDF.sqlContext.createDataFrame(testRowsRDD, StructType(
      Array(StructField("id", IntegerType), StructField("features", new VectorUDT),
        StructField("label", DoubleType))))
    val predResultsFromDF =
      xgBoostModelWithDF.transform(testDF).collect().map(row => (row.getAs[Int]("id"),
        row.getAs[mutable.WrappedArray[Float]]("prediction"))).toMap
    for (i <- predResultFromSeq.indices) {
      assert(predResultFromSeq(i).length === predResultsFromDF(i).length)
      for (j <- predResultFromSeq(i).indices) {
        assert(predResultFromSeq(i)(j) === predResultsFromDF(i)(j))
      }
    }
  }
}
@@ -20,107 +20,20 @@ import java.io.File
import java.nio.file.Files

import scala.collection.mutable.ListBuffer
import scala.io.Source
import scala.util.Random

-import org.apache.commons.logging.LogFactory
-import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors, DenseVector}
+import ml.dmlc.xgboost4j.java.{Booster => JBooster, DMatrix => JDMatrix}
+import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, XGBoost => ScalaXGBoost}
+import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfter, FunSuite}

-import ml.dmlc.xgboost4j.java.{Booster => JBooster, DMatrix => JDMatrix, XGBoostError}
-import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, EvalTrait}
-
-class XGBoostSuite extends FunSuite with BeforeAndAfter {
-
-  private implicit var sc: SparkContext = null
-  private val numWorkers = Runtime.getRuntime().availableProcessors()
-
-  private class EvalError extends EvalTrait {
-
-    val logger = LogFactory.getLog(classOf[EvalError])
-
-    private[xgboost4j] var evalMetric: String = "custom_error"
-
-    /**
-     * get evaluate metric
-     *
-     * @return evalMetric
-     */
-    override def getMetric: String = evalMetric
-
-    /**
-     * evaluate with predicts and data
-     *
-     * @param predicts predictions as array
-     * @param dmat data matrix to evaluate
-     * @return result of the metric
-     */
-    override def eval(predicts: Array[Array[Float]], dmat: DMatrix): Float = {
-      var error: Float = 0f
-      var labels: Array[Float] = null
-      try {
-        labels = dmat.getLabel
-      } catch {
-        case ex: XGBoostError =>
-          logger.error(ex)
-          return -1f
-      }
-      val nrow: Int = predicts.length
-      for (i <- 0 until nrow) {
-        if (labels(i) == 0.0 && predicts(i)(0) > 0) {
-          error += 1
-        } else if (labels(i) == 1.0 && predicts(i)(0) <= 0) {
-          error += 1
-        }
-      }
-      error / labels.length
-    }
-  }
-
-  before {
-    // build SparkContext
-    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
-    sc = new SparkContext(sparkConf)
-  }
-
-  after {
-    if (sc != null) {
-      sc.stop()
-    }
-  }
-
-  private def fromSVMStringToLabeledPoint(line: String): LabeledPoint = {
-    val labelAndFeatures = line.split(" ")
-    val label = labelAndFeatures(0).toInt
-    val features = labelAndFeatures.tail
-    val denseFeature = new Array[Double](129)
-    for (feature <- features) {
-      val idAndValue = feature.split(":")
-      denseFeature(idAndValue(0).toInt) = idAndValue(1).toDouble
-    }
-    LabeledPoint(label, new DenseVector(denseFeature))
-  }
-
-  private def readFile(filePath: String): List[LabeledPoint] = {
-    val file = Source.fromFile(new File(filePath))
-    val sampleList = new ListBuffer[LabeledPoint]
-    for (sample <- file.getLines()) {
-      sampleList += fromSVMStringToLabeledPoint(sample)
-    }
-    sampleList.toList
-  }
-
-  private def buildTrainingRDD(sparkContext: Option[SparkContext] = None): RDD[LabeledPoint] = {
-    val sampleList = readFile(getClass.getResource("/agaricus.txt.train").getFile)
-    sparkContext.getOrElse(sc).parallelize(sampleList, numWorkers)
-  }
+class XGBoostGeneralSuite extends Utils {

  test("build RDD containing boosters with the specified worker number") {
    val trainingRDD = buildTrainingRDD()
-    val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
+    val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
    import DataUtils._
    val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
    val boosterRDD = XGBoost.buildDistributedBoosters(
@@ -145,14 +58,15 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
    sc = null
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
    val customSparkContext = new SparkContext(sparkConf)
+    customSparkContext.setLogLevel("ERROR")
    val eval = new EvalError()
    val trainingRDD = buildTrainingRDD(Some(customSparkContext))
-    val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
+    val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
    import DataUtils._
    val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
    val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
      "objective" -> "binary:logistic").toMap
-    val xgBoostModel = XGBoost.train(trainingRDD, paramMap, round = 5,
+    val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
      nWorkers = numWorkers, useExternalMemory = true)
    assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
      testSetDMatrix) < 0.1)
@@ -194,13 +108,13 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
    val testRDD = buildDenseRDD().repartition(4)
    val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
      "objective" -> "binary:logistic").toMap
-    val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
+    val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
    xgBoostModel.predict(testRDD.map(_.features.toDense), missingValue = -0.1f).collect()
  }

  test("test consistency of prediction functions with RDD") {
    val trainingRDD = buildTrainingRDD()
-    val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile)
+    val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile)
    val testRDD = sc.parallelize(testSet, numSlices = 1).map(_.features)
    val testCollection = testRDD.collect()
    for (i <- testSet.indices) {
@@ -208,7 +122,7 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
    }
    val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
      "objective" -> "binary:logistic").toMap
-    val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
+    val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
    val predRDD = xgBoostModel.predict(testRDD)
    val predResult1 = predRDD.collect()(0)
    import DataUtils._
@@ -225,26 +139,25 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
    }
    val trainingRDD = buildTrainingRDD()
    val testRDD = buildEmptyRDD()
    import DataUtils._
    val tempDir = Files.createTempDirectory("xgboosttest-")
    val tempFile = Files.createTempFile(tempDir, "", "")
    val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
      "objective" -> "binary:logistic").toMap
-    val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
+    val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
    println(xgBoostModel.predict(testRDD).collect().length === 0)
  }

  test("test model consistency after save and load") {
    val eval = new EvalError()
    val trainingRDD = buildTrainingRDD()
-    val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
+    val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
    import DataUtils._
    val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
    val tempDir = Files.createTempDirectory("xgboosttest-")
    val tempFile = Files.createTempFile(tempDir, "", "")
    val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
      "objective" -> "binary:logistic").toMap
-    val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
+    val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
    val evalResults = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
      testSetDMatrix)
    assert(evalResults < 0.1)
@@ -261,12 +174,13 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite").
      set("spark.task.cpus", "4")
    val customSparkContext = new SparkContext(sparkConf)
+    customSparkContext.setLogLevel("ERROR")
    // start another app
    val trainingRDD = buildTrainingRDD(Some(customSparkContext))
    val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
      "objective" -> "binary:logistic", "nthread" -> 6).toMap
    intercept[IllegalArgumentException] {
-      XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
+      XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
    }
    customSparkContext.stop()
  }
@@ -279,13 +193,14 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sparkConf.registerKryoClasses(Array(classOf[Booster]))
    val customSparkContext = new SparkContext(sparkConf)
+    customSparkContext.setLogLevel("ERROR")
    val trainingRDD = buildTrainingRDD(Some(customSparkContext))
-    val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
+    val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
    import DataUtils._
    val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
    val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
      "objective" -> "binary:logistic").toMap
-    val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
+    val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
    assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
      testSetDMatrix) < 0.1)
    customSparkContext.stop()