[jvm-packages] Integration with Spark Dataframe/Dataset (#1559)

* bump up to Scala 2.11

* framework of data frame integration

* test consistency between RDD and DataFrame

* order preservation

* test order preservation

* example code and fix makefile

* improve type checking

* improve APIs (see the usage sketch after this list)

* user docs

* work around Travis CI's limitation on log length

* adjust test structure

* integrate with Spark 1.x

* spark 2.x integration

* remove spark 1.x implementation but provide instructions on how to downgrade
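
In brief, the new DataFrame API looks like this. This is a minimal sketch assuming only the call shapes exercised by the test suites in this diff (XGBoost.trainWithDataFrame, transform, and the label/features schema); the object name, the toy data, the single worker, and the assumption that transform needs only the "features" column are illustrative, not part of the commit.

import ml.dmlc.xgboost4j.scala.spark.XGBoost
import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

object XGBoostDFExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]").appName("XGBoostDFExample").getOrCreate()

    // a two-column DataFrame ("label": Double, "features": mllib Vector),
    // the same schema the test suites build for the agaricus data
    val rows = spark.sparkContext.parallelize(Seq(
      Row(0.0, Vectors.dense(1.0, 0.0, 2.0)),
      Row(1.0, Vectors.dense(0.0, 3.0, 1.0))))
    val schema = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", new VectorUDT, nullable = false)))
    val trainingDF = spark.createDataFrame(rows, schema)

    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
      "objective" -> "binary:logistic")

    // train against the DataFrame; same call shape as in XGBoostDFSuite
    val model = XGBoost.trainWithDataFrame(trainingDF, paramMap,
      round = 5, nWorkers = 1, useExternalMemory = false)

    // transform appends a "prediction" column computed from "features"
    model.transform(trainingDF).show()

    spark.stop()
  }
}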
Nan Zhu
2016-09-11 15:02:58 -04:00
committed by GitHub
parent 7ff742ebf7
commit fb02797e2a
15 changed files with 625 additions and 139 deletions


@@ -0,0 +1,38 @@
/*
Copyright (c) 2014 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ml.dmlc.xgboost4j.scala.spark

import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfter, FunSuite}

trait SharedSparkContext extends FunSuite with BeforeAndAfter {
protected implicit var sc: SparkContext = null
before {
// build SparkContext
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
sc = new SparkContext(sparkConf)
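// keep logs terse to work around Travis CI's limit on log length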
sc.setLogLevel("ERROR")
}
after {
if (sc != null) {
sc.stop()
}
}
}


@@ -0,0 +1,107 @@
/*
Copyright (c) 2014 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ml.dmlc.xgboost4j.scala.spark

import java.io.File

import scala.collection.mutable.ListBuffer
import scala.io.Source

import ml.dmlc.xgboost4j.java.XGBoostError
import ml.dmlc.xgboost4j.scala.{DMatrix, EvalTrait}
import org.apache.commons.logging.LogFactory
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{DenseVector, Vector => SparkVector}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

trait Utils extends SharedSparkContext {
protected val numWorkers = Runtime.getRuntime().availableProcessors()
protected class EvalError extends EvalTrait {
val logger = LogFactory.getLog(classOf[EvalError])
private[xgboost4j] var evalMetric: String = "custom_error"
/**
* get evaluate metric
*
* @return evalMetric
*/
override def getMetric: String = evalMetric
/**
* evaluate with predicts and data
*
* @param predicts predictions as array
* @param dmat data matrix to evaluate
* @return result of the metric
*/
override def eval(predicts: Array[Array[Float]], dmat: DMatrix): Float = {
var error: Float = 0f
var labels: Array[Float] = null
try {
labels = dmat.getLabel
} catch {
case ex: XGBoostError =>
logger.error(ex)
return -1f
}
val nrow: Int = predicts.length
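// a prediction counts as an error when the sign of the raw margin disagrees with the 0/1 label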
for (i <- 0 until nrow) {
if (labels(i) == 0.0 && predicts(i)(0) > 0) {
error += 1
} else if (labels(i) == 1.0 && predicts(i)(0) <= 0) {
error += 1
}
}
error / labels.length
}
}
protected def loadLabelPoints(filePath: String): List[LabeledPoint] = {
val file = Source.fromFile(new File(filePath))
val sampleList = new ListBuffer[LabeledPoint]
for (sample <- file.getLines()) {
sampleList += fromSVMStringToLabeledPoint(sample)
}
sampleList.toList
}
protected def fromSVMStringToLabelAndVector(line: String): (Double, SparkVector) = {
val labelAndFeatures = line.split(" ")
val label = labelAndFeatures(0).toDouble
val features = labelAndFeatures.tail
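// scatter "index:value" pairs into a fixed-size dense array; the test data keeps all feature indices below 129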
val denseFeature = new Array[Double](129)
for (feature <- features) {
val idAndValue = feature.split(":")
denseFeature(idAndValue(0).toInt) = idAndValue(1).toDouble
}
(label, new DenseVector(denseFeature))
}
protected def fromSVMStringToLabeledPoint(line: String): LabeledPoint = {
val (label, sv) = fromSVMStringToLabelAndVector(line)
LabeledPoint(label, sv)
}
protected def buildTrainingRDD(sparkContext: Option[SparkContext] = None): RDD[LabeledPoint] = {
val sampleList = loadLabelPoints(getClass.getResource("/agaricus.txt.train").getFile)
sparkContext.getOrElse(sc).parallelize(sampleList, numWorkers)
}
}


@@ -0,0 +1,129 @@
/*
Copyright (c) 2014 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ml.dmlc.xgboost4j.scala.spark

import java.io.File

import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.io.Source

import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.VectorUDT
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql._
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}

class XGBoostDFSuite extends Utils {
private def loadRow(filePath: String): List[Row] = {
val file = Source.fromFile(new File(filePath))
val rowList = new ListBuffer[Row]
for (rowLine <- file.getLines()) {
rowList += fromSVMStringToRow(rowLine)
}
rowList.toList
}
private def buildTrainingDataframe(sparkContext: Option[SparkContext] = None): DataFrame = {
val rowList = loadRow(getClass.getResource("/agaricus.txt.train").getFile)
val rowRDD = sparkContext.getOrElse(sc).parallelize(rowList, numWorkers)
val sparkSession = SparkSession.builder().appName("XGBoostDFSuite").getOrCreate()
sparkSession.createDataFrame(rowRDD,
StructType(Array(StructField("label", DoubleType, nullable = false),
StructField("features", new VectorUDT, nullable = false))))
}
private def fromSVMStringToRow(line: String): Row = {
val (label, sv) = fromSVMStringToLabelAndVector(line)
Row(label, sv)
}
test("test consistency between training with dataframe and RDD") {
val trainingDF = buildTrainingDataframe()
val trainingRDD = buildTrainingRDD()
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = false)
val xgBoostModelWithRDD = XGBoost.trainWithRDD(trainingRDD, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = false)
val eval = new EvalError()
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
assert(
eval.eval(xgBoostModelWithDF.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix) ===
eval.eval(xgBoostModelWithRDD.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix))
}
test("test transform of dataframe-based model") {
val trainingDF = buildTrainingDataframe()
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = false)
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile)
val testRowsRDD = sc.parallelize(testSet.zipWithIndex, numWorkers).map{
case (instance: LabeledPoint, id: Int) =>
Row(id, instance.features, instance.label)
}
val testDF = trainingDF.sparkSession.createDataFrame(testRowsRDD, StructType(
Array(StructField("id", IntegerType),
StructField("features", new VectorUDT), StructField("label", DoubleType))))
xgBoostModelWithDF.transform(testDF).show()
}
test("test order preservation of dataframe-based model") {
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val trainingItr = loadLabelPoints(getClass.getResource("/agaricus.txt.train").getFile).
iterator
val (testItr, auxTestItr) =
loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator.duplicate
import DataUtils._
val trainDMatrix = new DMatrix(new JDMatrix(trainingItr, null))
val testDMatrix = new DMatrix(new JDMatrix(testItr, null))
val xgboostModel = ScalaXGBoost.train(trainDMatrix, paramMap, 5)
val predResultFromSeq = xgboostModel.predict(testDMatrix)
val testRowsRDD = sc.parallelize(
auxTestItr.toList.zipWithIndex, numWorkers).map {
case (instance: LabeledPoint, id: Int) =>
Row(id, instance.features, instance.label)
}
val trainingDF = buildTrainingDataframe()
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
round = 5, nWorkers = numWorkers, useExternalMemory = false)
val testDF = trainingDF.sqlContext.createDataFrame(testRowsRDD, StructType(
Array(StructField("id", IntegerType), StructField("features", new VectorUDT),
StructField("label", DoubleType))))
val predResultsFromDF =
xgBoostModelWithDF.transform(testDF).collect().map(row => (row.getAs[Int]("id"),
row.getAs[mutable.WrappedArray[Float]]("prediction"))).toMap
for (i <- predResultFromSeq.indices) {
assert(predResultFromSeq(i).length === predResultsFromDF(i).length)
for (j <- predResultFromSeq(i).indices) {
assert(predResultFromSeq(i)(j) === predResultsFromDF(i)(j))
}
}
}
}


@@ -20,107 +20,20 @@ import java.io.File
import java.nio.file.Files
import scala.collection.mutable.ListBuffer
import scala.io.Source
import scala.util.Random
import org.apache.commons.logging.LogFactory
import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors, DenseVector}
import ml.dmlc.xgboost4j.java.{Booster => JBooster, DMatrix => JDMatrix}
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, XGBoost => ScalaXGBoost}
import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfter, FunSuite}
import ml.dmlc.xgboost4j.java.{Booster => JBooster, DMatrix => JDMatrix, XGBoostError}
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, EvalTrait}
class XGBoostSuite extends FunSuite with BeforeAndAfter {
private implicit var sc: SparkContext = null
private val numWorkers = Runtime.getRuntime().availableProcessors()
private class EvalError extends EvalTrait {
val logger = LogFactory.getLog(classOf[EvalError])
private[xgboost4j] var evalMetric: String = "custom_error"
/**
* get evaluate metric
*
* @return evalMetric
*/
override def getMetric: String = evalMetric
/**
* evaluate with predicts and data
*
* @param predicts predictions as array
* @param dmat data matrix to evaluate
* @return result of the metric
*/
override def eval(predicts: Array[Array[Float]], dmat: DMatrix): Float = {
var error: Float = 0f
var labels: Array[Float] = null
try {
labels = dmat.getLabel
} catch {
case ex: XGBoostError =>
logger.error(ex)
return -1f
}
val nrow: Int = predicts.length
for (i <- 0 until nrow) {
if (labels(i) == 0.0 && predicts(i)(0) > 0) {
error += 1
} else if (labels(i) == 1.0 && predicts(i)(0) <= 0) {
error += 1
}
}
error / labels.length
}
}
before {
// build SparkContext
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
sc = new SparkContext(sparkConf)
}
after {
if (sc != null) {
sc.stop()
}
}
private def fromSVMStringToLabeledPoint(line: String): LabeledPoint = {
val labelAndFeatures = line.split(" ")
val label = labelAndFeatures(0).toInt
val features = labelAndFeatures.tail
val denseFeature = new Array[Double](129)
for (feature <- features) {
val idAndValue = feature.split(":")
denseFeature(idAndValue(0).toInt) = idAndValue(1).toDouble
}
LabeledPoint(label, new DenseVector(denseFeature))
}
private def readFile(filePath: String): List[LabeledPoint] = {
val file = Source.fromFile(new File(filePath))
val sampleList = new ListBuffer[LabeledPoint]
for (sample <- file.getLines()) {
sampleList += fromSVMStringToLabeledPoint(sample)
}
sampleList.toList
}
private def buildTrainingRDD(sparkContext: Option[SparkContext] = None): RDD[LabeledPoint] = {
val sampleList = readFile(getClass.getResource("/agaricus.txt.train").getFile)
sparkContext.getOrElse(sc).parallelize(sampleList, numWorkers)
}
class XGBoostGeneralSuite extends Utils {
test("build RDD containing boosters with the specified worker number") {
val trainingRDD = buildTrainingRDD()
val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
val boosterRDD = XGBoost.buildDistributedBoosters(
@@ -145,14 +58,15 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
sc = null
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
val customSparkContext = new SparkContext(sparkConf)
customSparkContext.setLogLevel("ERROR")
val eval = new EvalError()
val trainingRDD = buildTrainingRDD(Some(customSparkContext))
val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.train(trainingRDD, paramMap, round = 5,
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
nWorkers = numWorkers, useExternalMemory = true)
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix) < 0.1)
@@ -194,13 +108,13 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
val testRDD = buildDenseRDD().repartition(4)
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
xgBoostModel.predict(testRDD.map(_.features.toDense), missingValue = -0.1f).collect()
}
test("test consistency of prediction functions with RDD") {
val trainingRDD = buildTrainingRDD()
val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile)
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile)
val testRDD = sc.parallelize(testSet, numSlices = 1).map(_.features)
val testCollection = testRDD.collect()
for (i <- testSet.indices) {
@@ -208,7 +122,7 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
}
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
val predRDD = xgBoostModel.predict(testRDD)
val predResult1 = predRDD.collect()(0)
import DataUtils._
@@ -225,26 +139,25 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
}
val trainingRDD = buildTrainingRDD()
val testRDD = buildEmptyRDD()
import DataUtils._
val tempDir = Files.createTempDirectory("xgboosttest-")
val tempFile = Files.createTempFile(tempDir, "", "")
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
assert(xgBoostModel.predict(testRDD).collect().length === 0)
}
test("test model consistency after save and load") {
val eval = new EvalError()
val trainingRDD = buildTrainingRDD()
val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
val tempDir = Files.createTempDirectory("xgboosttest-")
val tempFile = Files.createTempFile(tempDir, "", "")
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
val evalResults = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix)
assert(evalResults < 0.1)
@@ -261,12 +174,13 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite").
set("spark.task.cpus", "4")
val customSparkContext = new SparkContext(sparkConf)
customSparkContext.setLogLevel("ERROR")
// start another app
val trainingRDD = buildTrainingRDD(Some(customSparkContext))
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic", "nthread" -> 6).toMap
intercept[IllegalArgumentException] {
XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
}
customSparkContext.stop()
}
@@ -279,13 +193,14 @@ class XGBoostSuite extends FunSuite with BeforeAndAfter {
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sparkConf.registerKryoClasses(Array(classOf[Booster]))
val customSparkContext = new SparkContext(sparkConf)
customSparkContext.setLogLevel("ERROR")
val trainingRDD = buildTrainingRDD(Some(customSparkContext))
val testSet = readFile(getClass.getResource("/agaricus.txt.test").getFile).iterator
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
import DataUtils._
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
"objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.train(trainingRDD, paramMap, 5, numWorkers)
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix) < 0.1)
customSparkContext.stop()