[jvm-packages] Integration with Spark Dataframe/Dataset (#1559)

* bump up to Scala 2.11

* framework of data frame integration

* test consistency between RDD and DataFrame

* order preservation

* test order preservation

* example code and fix makefile

* improve type checking

* improve APIs (see the usage sketch after this list)

* user docs

* work around Travis CI's limitation on log length

* adjust test structure

* integrate with Spark 1.x

* spark 2.x integration

* remove spark 1.x implementation but provide instructions on how to downgrade
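
In brief, the new DataFrame API looks like this. This is a minimal sketch assuming only the call shapes exercised by the test suites in this diff (XGBoost.trainWithDataFrame, transform, and the label/features schema); the object name, the toy data, the single worker, and the assumption that transform needs only the "features" column are illustrative, not part of the commit.

import ml.dmlc.xgboost4j.scala.spark.XGBoost
import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

object XGBoostDFExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]").appName("XGBoostDFExample").getOrCreate()

    // a two-column DataFrame ("label": Double, "features": mllib Vector),
    // the same schema the test suites build for the agaricus data
    val rows = spark.sparkContext.parallelize(Seq(
      Row(0.0, Vectors.dense(1.0, 0.0, 2.0)),
      Row(1.0, Vectors.dense(0.0, 3.0, 1.0))))
    val schema = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", new VectorUDT, nullable = false)))
    val trainingDF = spark.createDataFrame(rows, schema)

    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
      "objective" -> "binary:logistic")

    // train against the DataFrame; same call shape as in XGBoostDFSuite
    val model = XGBoost.trainWithDataFrame(trainingDF, paramMap,
      round = 5, nWorkers = 1, useExternalMemory = false)

    // transform appends a "prediction" column computed from "features"
    model.transform(trainingDF).show()

    spark.stop()
  }
}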
Nan Zhu
2016-09-11 15:02:58 -04:00
committed by GitHub
parent 7ff742ebf7
commit fb02797e2a
15 changed files with 625 additions and 139 deletions


@@ -0,0 +1,38 @@
/*
Copyright (c) 2014 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ml.dmlc.xgboost4j.scala.spark

import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfter, FunSuite}

trait SharedSparkContext extends FunSuite with BeforeAndAfter {
protected implicit var sc: SparkContext = null
before {
// build SparkContext
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
sc = new SparkContext(sparkConf)
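// keep logs terse to work around Travis CI's limit on log length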
sc.setLogLevel("ERROR")
}
after {
if (sc != null) {
sc.stop()
}
}
}


@@ -0,0 +1,107 @@
/*
Copyright (c) 2014 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ml.dmlc.xgboost4j.scala.spark

import java.io.File

import scala.collection.mutable.ListBuffer
import scala.io.Source

import ml.dmlc.xgboost4j.java.XGBoostError
import ml.dmlc.xgboost4j.scala.{DMatrix, EvalTrait}
import org.apache.commons.logging.LogFactory
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{DenseVector, Vector => SparkVector}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

trait Utils extends SharedSparkContext {
protected val numWorkers = Runtime.getRuntime().availableProcessors()
protected class EvalError extends EvalTrait {
val logger = LogFactory.getLog(classOf[EvalError])
private[xgboost4j] var evalMetric: String = "custom_error"
/**
* get evaluate metric
*
* @return evalMetric
*/
override def getMetric: String = evalMetric
/**
* evaluate with predicts and data
*
* @param predicts predictions as array
* @param dmat data matrix to evaluate
* @return result of the metric
*/
override def eval(predicts: Array[Array[Float]], dmat: DMatrix): Float = {
var error: Float = 0f
var labels: Array[Float] = null
try {
labels = dmat.getLabel
} catch {
case ex: XGBoostError =>
logger.error(ex)
return -1f
}
val nrow: Int = predicts.length
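// a prediction counts as an error when the sign of the raw margin disagrees with the 0/1 label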
for (i <- 0 until nrow) {
if (labels(i) == 0.0 && predicts(i)(0) > 0) {
error += 1
} else if (labels(i) == 1.0 && predicts(i)(0) <= 0) {
error += 1
}
}
error / labels.length
}
}
protected def loadLabelPoints(filePath: String): List[LabeledPoint] = {
val file = Source.fromFile(new File(filePath))
val sampleList = new ListBuffer[LabeledPoint]
for (sample <- file.getLines()) {
sampleList += fromSVMStringToLabeledPoint(sample)
}
sampleList.toList
}
protected def fromSVMStringToLabelAndVector(line: String): (Double, SparkVector) = {
val labelAndFeatures = line.split(" ")
val label = labelAndFeatures(0).toDouble
val features = labelAndFeatures.tail
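// scatter "index:value" pairs into a fixed-size dense array; the test data keeps all feature indices below 129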
val denseFeature = new Array[Double](129)
for (feature <- features) {
val idAndValue = feature.split(":")
denseFeature(idAndValue(0).toInt) = idAndValue(1).toDouble
}
(label, new DenseVector(denseFeature))
}
protected def fromSVMStringToLabeledPoint(line: String): LabeledPoint = {
val (label, sv) = fromSVMStringToLabelAndVector(line)
LabeledPoint(label, sv)
}
protected def buildTrainingRDD(sparkContext: Option[SparkContext] = None): RDD[LabeledPoint] = {
val sampleList = loadLabelPoints(getClass.getResource("/agaricus.txt.train").getFile)
sparkContext.getOrElse(sc).parallelize(sampleList, numWorkers)
}
}


@@ -0,0 +1,129 @@
/*
Copyright (c) 2014 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ml.dmlc.xgboost4j.scala.spark

import java.io.File

import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.io.Source

import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.VectorUDT
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql._
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}

class XGBoostDFSuite extends Utils {
private def loadRow(filePath: String): List[Row] = {
val file = Source.fromFile(new File(filePath))
val rowList = new ListBuffer[Row]
for (rowLine <- file.getLines()) {
rowList += fromSVMStringToRow(rowLine)
}
rowList.toList
}
private def buildTrainingDataframe(sparkContext: Option[SparkContext] = None): DataFrame = {
val rowList = loadRow(getClass.getResource("/agaricus.txt.train").getFile)
val rowRDD = sparkContext.getOrElse(sc).parallelize(rowList, numWorkers)
val sparkSession = SparkSession.builder().appName("XGBoostDFSuite").getOrCreate()
sparkSession.createDataFrame(rowRDD,
StructType(Array(StructField("label", DoubleType, nullable = false),
StructField("features", new VectorUDT, nullable = false))))
}
private def fromSVMStringToRow(line: String): Row = {
val (label, sv) = fromSVMStringToLabelAndVector(line)
Row(label, sv)
}
test("test consistency between training with dataframe and RDD") {
val trainingDF = buildTrainingDataframe()
val trainingRDD = buildTrainingRDD()
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = false)
val xgBoostModelWithRDD = XGBoost.trainWithRDD(trainingRDD, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = false)
val eval = new EvalError()
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
assert(
eval.eval(xgBoostModelWithDF.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix) ===
eval.eval(xgBoostModelWithRDD.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix))
}
test("test transform of dataframe-based model") {
val trainingDF = buildTrainingDataframe()
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = false)
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile)
val testRowsRDD = sc.parallelize(testSet.zipWithIndex, numWorkers).map{
case (instance: LabeledPoint, id: Int) =>
Row(id, instance.features, instance.label)
}
val testDF = trainingDF.sparkSession.createDataFrame(testRowsRDD, StructType(
Array(StructField("id", IntegerType),
StructField("features", new VectorUDT), StructField("label", DoubleType))))
xgBoostModelWithDF.transform(testDF).show()
}
test("test order preservation of dataframe-based model") {
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val trainingItr = loadLabelPoints(getClass.getResource("/agaricus.txt.train").getFile).
iterator
val (testItr, auxTestItr) =
loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator.duplicate
import DataUtils._
val trainDMatrix = new DMatrix(new JDMatrix(trainingItr, null))
val testDMatrix = new DMatrix(new JDMatrix(testItr, null))
val xgboostModel = ScalaXGBoost.train(trainDMatrix, paramMap, 5)
val predResultFromSeq = xgboostModel.predict(testDMatrix)
val testRowsRDD = sc.parallelize(
auxTestItr.toList.zipWithIndex, numWorkers).map {
case (instance: LabeledPoint, id: Int) =>
Row(id, instance.features, instance.label)
}
val trainingDF = buildTrainingDataframe()
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = false)
val testDF = trainingDF.sqlContext.createDataFrame(testRowsRDD, StructType(
Array(StructField("id", IntegerType), StructField("features", new VectorUDT),
StructField("label", DoubleType))))
val predResultsFromDF =
xgBoostModelWithDF.transform(testDF).collect().map(row => (row.getAs[Int]("id"),
row.getAs[mutable.WrappedArray[Float]]("prediction"))).toMap
for (i <- predResultFromSeq.indices) {
assert(predResultFromSeq(i).length === predResultsFromDF(i).length)
for (j <- predResultFromSeq(i).indices) {
assert(predResultFromSeq(i)(j) === predResultsFromDF(i)(j))
}
}
}
}


@@ -20,107 +20,20 @@ import java.io.File
import java.nio.file.Files
import scala.collection.mutable.ListBuffer
import scala.io.Source
import scala.util.Random
import org.apache.commons.logging.LogFactory
import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors, DenseVector}
import ml.dmlc.xgboost4j.java.{Booster => JBooster, DMatrix => JDMatrix}
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, XGBoost => ScalaXGBoost}
import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfter, FunSuite}
import ml.dmlc.xgboost4j.java.{Booster => JBooster, DMatrix => JDMatrix, XGBoostError}
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, EvalTrait}
class XGBoostSuite extends FunSuite with BeforeAndAfter {
private implicit var sc: SparkContext = null
private val numWorkers = Runtime.getRuntime().availableProcessors()
private class EvalError extends EvalTrait {
val logger = LogFactory.getLog(classOf[EvalError])
private[xgboost4j] var evalMetric: String = "custom_error"
/**
* get evaluate metric
*
* @return evalMetric
*/
override def getMetric: String = evalMetric
/**
* evaluate with predicts and data
*
* @param predicts predictions as array
* @param dmat data matrix to evaluate
* @return result of the metric
*/
override def eval(predicts: Array[Array[Float]], dmat: DMatrix): Float = {
var error: Float = 0f
var labels: Array[Float] = null
try {
labels = dmat.getLabel
} catch {
case ex: XGBoostError =>
logger.error(ex)
return -1f
}
val nrow: Int = predicts.length
for (i <- 0 until nrow) {
if (labels(i) == 0.0 && predicts(i)(0) > 0) {
error += 1
} else if (labels(i) == 1.0 && predicts(i)(0) <= 0) {
error += 1
}
}
error / labels.length
}
}
before {
// build SparkContext
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
sc = new SparkContext(sparkConf)
}
after {
if (sc != null) {
sc.stop()
}
}
private def fromSVMStringToLabeledPoint(line: String): LabeledPoint = {
val labelAndFeatures = line.split(" ")
val label = labelAndFeatures(0).toInt
val features = labelAndFeatures.tail
val denseFeature = new Array[Double](129)
for (feature <- features) {
val idAndValue = feature.split(":")
denseFeature(idAndValue(0).toInt) = idAndValue(1).toDouble
}
LabeledPoint(label, new DenseVector(denseFeature))
}
private def readFile(filePath: String): List[LabeledPoint] = {
val file = Source.fromFile(new File(filePath))
val sampleList = new ListBuffer[LabeledPoint]
for (sample <- file.getLines()) {
sampleList += fromSVMStringToLabeledPoint(sample)
}
sampleList.toList
}
private def buildTrainingRDD(sparkContext: Option[SparkContext] = None): RDD[LabeledPoint] = {
val sampleList = readFile(getClass.getResource("/agaricus.txt.train").getFile)
sparkContext.getOrElse(sc).parallelize(sampleList, numWorkers)
}
class XGBoostGeneralSuite extends Utils {
test("build RDD containing boosters with the specified worker number") {
val trainingRDD = buildTrainingRDD()
val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
val boosterRDD = XGBoost.buildDistributedBoosters(
@@ -145,14 +58,15 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
sc = null
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
val customSparkContext = new SparkContext(sparkConf)
customSparkContext.setLogLevel("ERROR")
val eval = new EvalError()
val trainingRDD = buildTrainingRDD(Some(customSparkContext))
val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.train(trainingRDD, paramMap, round = 5,
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
nWorkers = numWorkers, useExternalMemory = true)
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix) < 0.1)
@@ -194,13 +108,13 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
val testRDD = buildDenseRDD().repartition(4)
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
xgBoostModel.predict(testRDD.map(_.features.toDense), missingValue = -0.1f).collect()
}
test("test consistency of prediction functions with RDD") {
val trainingRDD = buildTrainingRDD()
val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile)
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile)
val testRDD = sc.parallelize(testSet, numSlices = 1).map(_.features)
val testCollection = testRDD.collect()
for (i <- testSet.indices) {
@@ -208,7 +122,7 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
}
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
val predRDD = xgBoostModel.predict(testRDD)
val predResult1 = predRDD.collect()(0)
import DataUtils._
@@ -225,26 +139,25 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
}
val trainingRDD = buildTrainingRDD()
val testRDD = buildEmptyRDD()
import DataUtils._
val tempDir = Files.createTempDirectory("xgboosttest-")
val tempFile = Files.createTempFile(tempDir, "", "")
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
assert(xgBoostModel.predict(testRDD).collect().length === 0)
}
test("test model consistency after save and load") {
val eval = new EvalError()
val trainingRDD = buildTrainingRDD()
val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
val tempDir = Files.createTempDirectory("xgboosttest-")
val tempFile = Files.createTempFile(tempDir, "", "")
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
val evalResults = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix)
assert(evalResults < 0.1)
@@ -261,12 +174,13 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite").
set("spark.task.cpus", "4")
val customSparkContext = new SparkContext(sparkConf)
customSparkContext.setLogLevel("ERROR")
// start another app
val trainingRDD = buildTrainingRDD(Some(customSparkContext))
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic", "nthread" -> 6).toMap
intercept[IllegalArgumentException] {
XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
}
customSparkContext.stop()
}
@@ -279,13 +193,14 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sparkConf.registerKryoClasses(Array(classOf[Booster]))
val customSparkContext = new SparkContext(sparkConf)
customSparkContext.setLogLevel("ERROR")
val trainingRDD = buildTrainingRDD(Some(customSparkContext))
val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix) < 0.1)
customSparkContext.stop()