[jvm-packages] separate classification and regression model and integrate with ML package (#1608)
This commit is contained in:
@@ -50,6 +50,8 @@ class EvalError extends EvalTrait {
|
||||
logger.error(ex)
|
||||
return -1f
|
||||
}
|
||||
require(predicts.length == labels.length, s"predicts length ${predicts.length} has to be" +
|
||||
s" equal with label length ${labels.length}")
|
||||
val nrow: Int = predicts.length
|
||||
for (i <- 0 until nrow) {
|
||||
if (labels(i) == 0.0 && predicts(i)(0) > 0) {
|
||||
|
||||
@@ -17,20 +17,21 @@
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.scalatest.{BeforeAndAfter, FunSuite}
|
||||
import org.scalatest.{BeforeAndAfterAll, FunSuite}
|
||||
|
||||
class SharedSparkContext extends FunSuite with BeforeAndAfter with Serializable {
|
||||
trait SharedSparkContext extends FunSuite with BeforeAndAfterAll with Serializable {
|
||||
|
||||
@transient protected implicit var sc: SparkContext = null
|
||||
|
||||
before {
|
||||
override def beforeAll() {
|
||||
// build SparkContext
|
||||
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
|
||||
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite").
|
||||
set("spark.driver.memory", "512m")
|
||||
sc = new SparkContext(sparkConf)
|
||||
sc.setLogLevel("ERROR")
|
||||
}
|
||||
|
||||
after {
|
||||
override def afterAll() {
|
||||
if (sc != null) {
|
||||
sc.stop()
|
||||
}
|
||||
|
||||
@@ -21,17 +21,23 @@ import java.io.File
|
||||
import scala.collection.mutable.ListBuffer
|
||||
import scala.io.Source
|
||||
|
||||
import ml.dmlc.xgboost4j.java.XGBoostError
|
||||
import ml.dmlc.xgboost4j.scala.{DMatrix, EvalTrait}
|
||||
import org.apache.commons.logging.LogFactory
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.mllib.linalg.{DenseVector, Vector => SparkVector}
|
||||
import org.apache.spark.mllib.regression.LabeledPoint
|
||||
import org.apache.spark.ml.feature.LabeledPoint
|
||||
import org.apache.spark.ml.linalg.{DenseVector, Vector => SparkVector}
|
||||
import org.apache.spark.rdd.RDD
|
||||
|
||||
trait Utils extends Serializable {
|
||||
protected val numWorkers = Runtime.getRuntime().availableProcessors()
|
||||
|
||||
protected var labeledPointsRDD: RDD[LabeledPoint] = null
|
||||
|
||||
/**
 * Deletes every entry directly under the current working directory whose name starts
 * with `prefix`.
 *
 * Deletion is best-effort: the result of `File.delete()` is ignored, so entries that
 * cannot be removed are left in place without any error.
 *
 * @param prefix file-name prefix selecting the entries to delete
 */
protected def cleanExternalCache(prefix: String): Unit = {
  // File.listFiles() returns null (not an empty array) when the directory cannot be
  // listed; fall back to an empty array so cleanup never throws a NullPointerException.
  val entries = Option(new File(".").listFiles()).getOrElse(Array.empty[File])
  entries.filter(_.getName.startsWith(prefix)).foreach(_.delete())
}
|
||||
|
||||
protected def loadLabelPoints(filePath: String): List[LabeledPoint] = {
|
||||
val file = Source.fromFile(new File(filePath))
|
||||
val sampleList = new ListBuffer[LabeledPoint]
|
||||
@@ -41,6 +47,15 @@ trait Utils extends Serializable {
|
||||
sampleList.toList
|
||||
}
|
||||
|
||||
/**
 * Reads the file at `filePath` line by line and converts each line with
 * `fromSVMStringToLabelAndVector`.
 *
 * @param filePath path of the file to read
 * @return one (label, features) pair per line, in file order
 */
protected def loadLabelAndVector(filePath: String): List[(Double, SparkVector)] = {
  val source = Source.fromFile(new File(filePath))
  try {
    // getLines() is lazy, so the result must be forced with toList before the
    // source is closed in the finally block.
    source.getLines().map(fromSVMStringToLabelAndVector).toList
  } finally {
    // Release the underlying file handle even if parsing a line throws.
    source.close()
  }
}
|
||||
|
||||
protected def fromSVMStringToLabelAndVector(line: String): (Double, SparkVector) = {
|
||||
val labelAndFeatures = line.split(" ")
|
||||
val label = labelAndFeatures(0).toDouble
|
||||
@@ -59,7 +74,10 @@ trait Utils extends Serializable {
|
||||
}
|
||||
|
||||
protected def buildTrainingRDD(sparkContext: SparkContext): RDD[LabeledPoint] = {
|
||||
val sampleList = loadLabelPoints(getClass.getResource("/agaricus.txt.train").getFile)
|
||||
sparkContext.parallelize(sampleList, numWorkers)
|
||||
if (labeledPointsRDD == null) {
|
||||
val sampleList = loadLabelPoints(getClass.getResource("/agaricus.txt.train").getFile)
|
||||
labeledPointsRDD = sparkContext.parallelize(sampleList, numWorkers)
|
||||
}
|
||||
labeledPointsRDD
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
|
||||
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
/**
 * Tests that each build their own SparkContext from a custom SparkConf instead of
 * using a shared one.
 */
class XGBoostConfigureSuite extends FunSuite with Utils {

  /**
   * Runs `body` with a fresh SparkContext built from `sparkConf` and stops that context
   * afterwards, whether `body` returns normally or throws.
   *
   * @param sparkConf configuration used to create the context
   * @param body      test logic that receives the freshly created context
   * @return whatever `body` returns
   */
  private def withSparkContext[T](sparkConf: SparkConf)(body: SparkContext => T): T = {
    // buildTrainingRDD caches its result in labeledPointsRDD; drop any RDD that was
    // created by a previous (now stopped) context so it is rebuilt on this one.
    labeledPointsRDD = null
    val customSparkContext = new SparkContext(sparkConf)
    try {
      customSparkContext.setLogLevel("ERROR")
      body(customSparkContext)
    } finally {
      // Always stop the context; a context left running after a failed assertion
      // would prevent the next test from creating its own.
      customSparkContext.stop()
    }
  }

  test("nthread configuration must be equal to spark.task.cpus") {
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite").
      set("spark.task.cpus", "4")
    withSparkContext(sparkConf) { customSparkContext =>
      // start another app
      val trainingRDD = buildTrainingRDD(customSparkContext)
      // nthread (6) deliberately differs from spark.task.cpus (4) set above.
      val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
        "objective" -> "binary:logistic", "nthread" -> 6)
      intercept[IllegalArgumentException] {
        XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
      }
    }
  }

  test("kryoSerializer test") {
    val eval = new EvalError()
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sparkConf.registerKryoClasses(Array(classOf[Booster]))
    withSparkContext(sparkConf) { customSparkContext =>
      val trainingRDD = buildTrainingRDD(customSparkContext)
      val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
      import DataUtils._
      val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
      val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
        "objective" -> "binary:logistic")
      val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
      assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
        testSetDMatrix) < 0.1)
    }
  }
}
|
||||
@@ -25,77 +25,27 @@ import scala.io.Source
|
||||
import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
|
||||
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.mllib.linalg.VectorUDT
|
||||
import org.apache.spark.mllib.regression.LabeledPoint
|
||||
import org.apache.spark.ml.feature.LabeledPoint
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}
|
||||
|
||||
class XGBoostDFSuite extends SharedSparkContext with Utils {
|
||||
|
||||
private def loadRow(filePath: String): List[Row] = {
|
||||
val file = Source.fromFile(new File(filePath))
|
||||
val rowList = new ListBuffer[Row]
|
||||
for (rowLine <- file.getLines()) {
|
||||
rowList += fromSVMStringToRow(rowLine)
|
||||
private var trainingDF: DataFrame = null
|
||||
|
||||
private def buildTrainingDataframe(sparkContext: Option[SparkContext] = None): DataFrame = {
|
||||
if (trainingDF == null) {
|
||||
val rowList = loadLabelPoints(getClass.getResource("/agaricus.txt.train").getFile)
|
||||
val labeledPointsRDD = sparkContext.getOrElse(sc).parallelize(rowList, numWorkers)
|
||||
val sparkSession = SparkSession.builder().appName("XGBoostDFSuite").getOrCreate()
|
||||
import sparkSession.implicits._
|
||||
trainingDF = sparkSession.createDataset(labeledPointsRDD).toDF
|
||||
}
|
||||
rowList.toList
|
||||
trainingDF
|
||||
}
|
||||
|
||||
private def buildTrainingDataframe(sparkContext: Option[SparkContext] = None):
|
||||
DataFrame = {
|
||||
val rowList = loadRow(getClass.getResource("/agaricus.txt.train").getFile)
|
||||
val rowRDD = sparkContext.getOrElse(sc).parallelize(rowList, numWorkers)
|
||||
val sparkSession = SparkSession.builder().appName("XGBoostDFSuite").getOrCreate()
|
||||
sparkSession.createDataFrame(rowRDD,
|
||||
StructType(Array(StructField("label", DoubleType, nullable = false),
|
||||
StructField("features", new VectorUDT, nullable = false))))
|
||||
}
|
||||
|
||||
private def fromSVMStringToRow(line: String): Row = {
|
||||
val (label, sv) = fromSVMStringToLabelAndVector(line)
|
||||
Row(label, sv)
|
||||
}
|
||||
|
||||
test("test consistency between training with dataframe and RDD") {
|
||||
val trainingDF = buildTrainingDataframe()
|
||||
val trainingRDD = buildTrainingRDD(sc)
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
|
||||
round = 5, nWorkers = numWorkers, useExternalMemory = false)
|
||||
val xgBoostModelWithRDD = XGBoost.trainWithRDD(trainingRDD, paramMap,
|
||||
round = 5, nWorkers = numWorkers, useExternalMemory = false)
|
||||
val eval = new EvalError()
|
||||
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
|
||||
import DataUtils._
|
||||
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
|
||||
assert(
|
||||
eval.eval(xgBoostModelWithDF.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) ===
|
||||
eval.eval(xgBoostModelWithRDD.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix))
|
||||
}
|
||||
|
||||
test("test transform of dataframe-based model") {
|
||||
val trainingDF = buildTrainingDataframe()
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
|
||||
round = 5, nWorkers = numWorkers, useExternalMemory = false)
|
||||
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile)
|
||||
val testRowsRDD = sc.parallelize(testSet.zipWithIndex, numWorkers).map{
|
||||
case (instance: LabeledPoint, id: Int) =>
|
||||
Row(id, instance.features, instance.label)
|
||||
}
|
||||
val testDF = trainingDF.sparkSession.createDataFrame(testRowsRDD, StructType(
|
||||
Array(StructField("id", IntegerType),
|
||||
StructField("features", new VectorUDT), StructField("label", DoubleType))))
|
||||
xgBoostModelWithDF.transform(testDF).show()
|
||||
}
|
||||
|
||||
test("test order preservation of dataframe-based model") {
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
test("test consistency and order preservation of dataframe-based model") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
val trainingItr = loadLabelPoints(getClass.getResource("/agaricus.txt.train").getFile).
|
||||
iterator
|
||||
val (testItr, auxTestItr) =
|
||||
@@ -105,25 +55,109 @@ class XGBoostDFSuite extends SharedSparkContext with Utils {
|
||||
val testDMatrix = new DMatrix(new JDMatrix(testItr, null))
|
||||
val xgboostModel = ScalaXGBoost.train(trainDMatrix, paramMap, 5)
|
||||
val predResultFromSeq = xgboostModel.predict(testDMatrix)
|
||||
val testRowsRDD = sc.parallelize(
|
||||
auxTestItr.toList.zipWithIndex, numWorkers).map {
|
||||
val testSetItr = auxTestItr.zipWithIndex.map {
|
||||
case (instance: LabeledPoint, id: Int) =>
|
||||
Row(id, instance.features, instance.label)
|
||||
(id, instance.features, instance.label)
|
||||
}
|
||||
val trainingDF = buildTrainingDataframe()
|
||||
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
|
||||
round = 5, nWorkers = numWorkers, useExternalMemory = false)
|
||||
val testDF = trainingDF.sqlContext.createDataFrame(testRowsRDD, StructType(
|
||||
Array(StructField("id", IntegerType), StructField("features", new VectorUDT),
|
||||
StructField("label", DoubleType))))
|
||||
val predResultsFromDF =
|
||||
xgBoostModelWithDF.transform(testDF).collect().map(row => (row.getAs[Int]("id"),
|
||||
row.getAs[mutable.WrappedArray[Float]]("prediction"))).toMap
|
||||
val testDF = trainingDF.sparkSession.createDataFrame(testSetItr.toList).toDF(
|
||||
"id", "features", "label")
|
||||
val predResultsFromDF = xgBoostModelWithDF.setExternalMemory(true).transform(testDF).
|
||||
collect().map(row =>
|
||||
(row.getAs[Int]("id"), row.getAs[mutable.WrappedArray[Float]]("probabilities"))
|
||||
).toMap
|
||||
assert(testDF.count() === predResultsFromDF.size)
|
||||
for (i <- predResultFromSeq.indices) {
|
||||
assert(predResultFromSeq(i).length === predResultsFromDF(i).length)
|
||||
for (j <- predResultFromSeq(i).indices) {
|
||||
assert(predResultFromSeq(i)(j) === predResultsFromDF(i)(j))
|
||||
}
|
||||
}
|
||||
cleanExternalCache("XGBoostDFSuite")
|
||||
}
|
||||
|
||||
// Smoke test: trains a model with the binary:logistic objective on the shared training
// DataFrame and applies transformLeaf to an (id, features, label) test DataFrame.
// Nothing is asserted about the output; the test only fails if a call throws.
test("test transformLeaf") {
  val boosterParams = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "binary:logistic")
  val testPoints = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile)
  val trainingDF = buildTrainingDataframe()
  val model = XGBoost.trainWithDataFrame(trainingDF, boosterParams,
    round = 5, nWorkers = numWorkers, useExternalMemory = false)
  // Tag each test point with its position in the file so rows carry an "id" column.
  val indexedRows = testPoints.zipWithIndex.map { case (point, rowId) =>
    (rowId, point.features, point.label)
  }
  val testDF = trainingDF.sparkSession.createDataFrame(indexedRows)
    .toDF("id", "features", "label")
  model.transformLeaf(testDF).show()
}
|
||||
|
||||
// Trains a reg:linear model on the machine.txt data and checks that transform keeps
// the input columns and adds the configured prediction column.
test("test schema of XGBoostRegressionModel") {
  val boosterParams = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "reg:linear")
  // Tag each test point with its position in the file so rows carry an "id" column.
  val indexedTestRows = loadLabelPoints(getClass.getResource("/machine.txt.test").getFile)
    .zipWithIndex.map { case (point, rowId) => (rowId, point.features, point.label) }
  val trainingPoints = sc.parallelize(
    loadLabelPoints(getClass.getResource("/machine.txt.train").getFile), numWorkers)
  val session = SparkSession.builder().appName("XGBoostDFSuite").getOrCreate()
  val trainingDF = {
    import session.implicits._
    session.createDataset(trainingPoints).toDF
  }
  val model = XGBoost.trainWithDataFrame(trainingDF, boosterParams,
    round = 5, nWorkers = numWorkers, useExternalMemory = true)
  model.setPredictionCol("final_prediction")
  val testDF = trainingDF.sparkSession.createDataFrame(indexedTestRows)
    .toDF("id", "features", "label")
  val predictionDF = model.setExternalMemory(true).transform(testDF)
  // Input columns must survive and the renamed prediction column must be present.
  for (column <- Seq("id", "features", "label", "final_prediction")) {
    assert(predictionDF.columns.contains(column) === true)
  }
  predictionDF.show()
  cleanExternalCache("XGBoostDFSuite")
}
|
||||
|
||||
// Trains a binary:logistic model and checks which output columns transform produces
// for three settings of the raw-prediction / prediction column names. The assertions
// show that an empty column name is expected to suppress that column.
test("test schema of XGBoostClassificationModel") {
  // Checks the input columns plus the two configurable output columns, in a fixed order.
  def assertColumns(df: DataFrame, hasRawPrediction: Boolean,
      hasFinalPrediction: Boolean): Unit = {
    val expectations = Seq("id" -> true, "features" -> true, "label" -> true,
      "raw_prediction" -> hasRawPrediction, "final_prediction" -> hasFinalPrediction)
    for ((column, expected) <- expectations) {
      assert(df.columns.contains(column) === expected)
    }
  }

  val boosterParams = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "binary:logistic")
  // Tag each test point with its position in the file so rows carry an "id" column.
  val indexedTestRows = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile)
    .zipWithIndex.map { case (point, rowId) => (rowId, point.features, point.label) }
  val trainingDF = buildTrainingDataframe()
  val model = XGBoost.trainWithDataFrame(trainingDF, boosterParams,
    round = 5, nWorkers = numWorkers, useExternalMemory = true)
  val classifier = model.asInstanceOf[XGBoostClassificationModel]
  val testDF = trainingDF.sparkSession.createDataFrame(indexedTestRows)
    .toDF("id", "features", "label")

  // Both output columns named: both must appear.
  classifier.setRawPredictionCol("raw_prediction").setPredictionCol("final_prediction")
  assertColumns(model.setExternalMemory(true).transform(testDF),
    hasRawPrediction = true, hasFinalPrediction = true)

  // Raw prediction column name cleared: only the final prediction must appear.
  classifier.setRawPredictionCol("").setPredictionCol("final_prediction")
  assertColumns(model.transform(testDF),
    hasRawPrediction = false, hasFinalPrediction = true)

  // Prediction column name cleared: only the raw prediction must appear.
  classifier.setRawPredictionCol("raw_prediction").setPredictionCol("")
  assertColumns(model.transform(testDF),
    hasRawPrediction = true, hasFinalPrediction = false)

  cleanExternalCache("XGBoostDFSuite")
}
|
||||
}
|
||||
|
||||
@@ -16,66 +16,47 @@
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import java.io.File
|
||||
import java.nio.file.Files
|
||||
|
||||
import scala.collection.mutable.ListBuffer
|
||||
import scala.util.Random
|
||||
|
||||
import ml.dmlc.xgboost4j.java.{Booster => JBooster, DMatrix => JDMatrix}
|
||||
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, XGBoost => ScalaXGBoost}
|
||||
import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors}
|
||||
import org.apache.spark.mllib.regression.LabeledPoint
|
||||
import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
|
||||
import ml.dmlc.xgboost4j.scala.DMatrix
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.ml.feature.LabeledPoint
|
||||
import org.apache.spark.ml.linalg.{Vector => SparkVector, Vectors}
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
|
||||
class XGBoostGeneralSuite extends SharedSparkContext with Utils {
|
||||
|
||||
test("build RDD containing boosters with the specified worker number") {
|
||||
val trainingRDD = buildTrainingRDD(sc)
|
||||
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
|
||||
import DataUtils._
|
||||
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
|
||||
val boosterRDD = XGBoost.buildDistributedBoosters(
|
||||
trainingRDD,
|
||||
List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
|
||||
List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic").toMap,
|
||||
new scala.collection.mutable.HashMap[String, String],
|
||||
numWorkers = 2, round = 5, eval = null, obj = null, useExternalMemory = false)
|
||||
numWorkers = 2, round = 5, eval = null, obj = null, useExternalMemory = true)
|
||||
val boosterCount = boosterRDD.count()
|
||||
assert(boosterCount === 2)
|
||||
val boosters = boosterRDD.collect()
|
||||
val eval = new EvalError()
|
||||
for (booster <- boosters) {
|
||||
// the threshold is 0.11 because it does not sync boosters with AllReduce
|
||||
val predicts = booster.predict(testSetDMatrix, outPutMargin = true)
|
||||
assert(eval.eval(predicts, testSetDMatrix) < 0.11)
|
||||
}
|
||||
cleanExternalCache("XGBoostSuite")
|
||||
}
|
||||
|
||||
test("training with external memory cache") {
|
||||
sc.stop()
|
||||
sc = null
|
||||
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
|
||||
val customSparkContext = new SparkContext(sparkConf)
|
||||
customSparkContext.setLogLevel("ERROR")
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = buildTrainingRDD(customSparkContext)
|
||||
val trainingRDD = buildTrainingRDD(sc)
|
||||
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
|
||||
import DataUtils._
|
||||
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "0",
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5,
|
||||
nWorkers = numWorkers, useExternalMemory = true)
|
||||
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) < 0.1)
|
||||
customSparkContext.stop()
|
||||
// clean
|
||||
val dir = new File(".")
|
||||
for (file <- dir.listFiles() if file.getName.startsWith("XGBoostSuite-0-dtrain_cache")) {
|
||||
file.delete()
|
||||
}
|
||||
cleanExternalCache("XGBoostSuite")
|
||||
}
|
||||
|
||||
test("test with dense vectors containing missing value") {
|
||||
@@ -106,10 +87,13 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
|
||||
}
|
||||
val trainingRDD = buildDenseRDD().repartition(4)
|
||||
val testRDD = buildDenseRDD().repartition(4)
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers,
|
||||
useExternalMemory = true)
|
||||
xgBoostModel.predict(testRDD.map(_.features.toDense), missingValue = -0.1f).collect()
|
||||
// clean
|
||||
cleanExternalCache("XGBoostSuite")
|
||||
}
|
||||
|
||||
test("test consistency of prediction functions with RDD") {
|
||||
@@ -120,11 +104,12 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
|
||||
for (i <- testSet.indices) {
|
||||
assert(testCollection(i).toDense.values.sameElements(testSet(i).features.toDense.values))
|
||||
}
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
val predRDD = xgBoostModel.predict(testRDD)
|
||||
val predResult1 = predRDD.collect()(0)
|
||||
assert(testRDD.count() === predResult1.length)
|
||||
import DataUtils._
|
||||
val predResult2 = xgBoostModel.booster.predict(new DMatrix(testSet.iterator))
|
||||
for (i <- predResult1.indices; j <- predResult1(i).indices) {
|
||||
@@ -134,9 +119,9 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
|
||||
|
||||
test("test eval functions with RDD") {
|
||||
val trainingRDD = buildTrainingRDD(sc).cache()
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5, nWorkers = numWorkers)
|
||||
xgBoostModel.eval(trainingRDD, "eval1", iter = 5, useExternalCache = false)
|
||||
xgBoostModel.eval(trainingRDD, "eval2", evalFunc = new EvalError, useExternalCache = false)
|
||||
}
|
||||
@@ -150,7 +135,7 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
|
||||
val testRDD = buildEmptyRDD()
|
||||
val tempDir = Files.createTempDirectory("xgboosttest-")
|
||||
val tempFile = Files.createTempFile(tempDir, "", "")
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
println(xgBoostModel.predict(testRDD).collect().length === 0)
|
||||
@@ -164,8 +149,8 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
|
||||
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
|
||||
val tempDir = Files.createTempDirectory("xgboosttest-")
|
||||
val tempFile = Files.createTempFile(tempDir, "", "")
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
val evalResults = eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix)
|
||||
@@ -177,41 +162,40 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
|
||||
assert(loadedEvalResults == evalResults)
|
||||
}
|
||||
|
||||
test("nthread configuration must be equal to spark.task.cpus") {
|
||||
sc.stop()
|
||||
sc = null
|
||||
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite").
|
||||
set("spark.task.cpus", "4")
|
||||
val customSparkContext = new SparkContext(sparkConf)
|
||||
customSparkContext.setLogLevel("ERROR")
|
||||
// start another app
|
||||
val trainingRDD = buildTrainingRDD(customSparkContext)
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
|
||||
"objective" -> "binary:logistic", "nthread" -> 6).toMap
|
||||
intercept[IllegalArgumentException] {
|
||||
XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
}
|
||||
customSparkContext.stop()
|
||||
}
|
||||
|
||||
test("kryoSerializer test") {
|
||||
sc.stop()
|
||||
sc = null
|
||||
val eval = new EvalError()
|
||||
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite")
|
||||
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
||||
sparkConf.registerKryoClasses(Array(classOf[Booster]))
|
||||
val customSparkContext = new SparkContext(sparkConf)
|
||||
customSparkContext.setLogLevel("ERROR")
|
||||
val trainingRDD = buildTrainingRDD(customSparkContext)
|
||||
val testSet = loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator
|
||||
import DataUtils._
|
||||
val testSetDMatrix = new DMatrix(new JDMatrix(testSet, null))
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "0",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
assert(eval.eval(xgBoostModel.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) < 0.1)
|
||||
customSparkContext.stop()
|
||||
// Round-trips a reg:linear model and a binary:logistic model through
// saveModelAsHadoopFile / loadModelFromHadoopFile and checks that the loaded model has
// the expected concrete class and column settings.
test("test save and load of different types of models") {
  val tempDir = Files.createTempDirectory("xgboosttest-")
  // Both models are written to the same path; the second save targets the same file.
  val modelPath = Files.createTempFile(tempDir, "", "").toFile.getAbsolutePath
  val trainingRDD = buildTrainingRDD(sc)

  // validate regression model
  val regressionParams = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "reg:linear")
  val regressionModel = XGBoost.trainWithRDD(trainingRDD, regressionParams, round = 5,
    nWorkers = numWorkers, useExternalMemory = false)
  regressionModel.setFeaturesCol("feature_col")
  regressionModel.setLabelCol("label_col")
  regressionModel.setPredictionCol("prediction_col")
  regressionModel.saveModelAsHadoopFile(modelPath)
  val loadedRegressionModel = XGBoost.loadModelFromHadoopFile(modelPath)
  assert(loadedRegressionModel.isInstanceOf[XGBoostRegressionModel])
  assert(loadedRegressionModel.getFeaturesCol == "feature_col")
  assert(loadedRegressionModel.getLabelCol == "label_col")
  assert(loadedRegressionModel.getPredictionCol == "prediction_col")

  // classification model
  val classificationParams = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "binary:logistic")
  val classificationModel = XGBoost.trainWithRDD(trainingRDD, classificationParams,
    round = 5, nWorkers = numWorkers, useExternalMemory = false)
  val classifier = classificationModel.asInstanceOf[XGBoostClassificationModel]
  classifier.setRawPredictionCol("raw_col")
  classifier.setThresholds(Array(0.5, 0.5))
  classificationModel.saveModelAsHadoopFile(modelPath)
  val loadedClassificationModel = XGBoost.loadModelFromHadoopFile(modelPath)
  assert(loadedClassificationModel.isInstanceOf[XGBoostClassificationModel])
  val loadedClassifier = loadedClassificationModel.asInstanceOf[XGBoostClassificationModel]
  assert(loadedClassifier.getRawPredictionCol ==
    "raw_col")
  assert(loadedClassifier.getThresholds.deep ==
    Array(0.5, 0.5).deep)
  // Columns that were not set on this model are expected to load with these names.
  assert(loadedClassificationModel.getFeaturesCol == "features")
  assert(loadedClassificationModel.getLabelCol == "label")
  assert(loadedClassificationModel.getPredictionCol == "prediction")
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user