[jvm-packages] Added baseMargin to ml.dmlc.xgboost4j.LabeledPoint (#2532)
* Converted ml.dmlc.xgboost4j.LabeledPoint to Scala
This makes it easy to integrate LabeledPoint with Spark DataFrame APIs,
which support encoding/decoding case classes out of the box. An alternative
solution would be to keep LabeledPoint in Java and make it a Bean by
generating boilerplate getters/setters. I have decided against that, even
though the conversion in this PR implies a public API change.
I also had to remove the factory methods fromSparseVector and
fromDenseVector because a) they would need to be duplicated to support
overloaded calls with extra data (e.g. weight); and b) Scala would expose
them via mangled $.MODULE$ which looks ugly in Java.
Additionally, this commit makes it possible to switch to LabeledPoint in
all public APIs and effectively to pass initial margin/group as part of
the point. This seems to be the only reliable way of implementing distributed
learning with these data. Note that the group size format used by single-node
XGBoost is not compatible with that scenario, since the partition split
could divide a group into two chunks.
* Switched to ml.dmlc.xgboost4j.LabeledPoint in RDD-based public APIs
Note that DataFrame-based and Flink APIs are not affected by this change.
* Removed baseMargin argument in favour of the LabeledPoint field
* Do a single pass over the partition in buildDistributedBoosters
Note that there is no formal guarantee that
val repartitioned = rdd.repartition(42)
repartitioned.zipPartitions(repartitioned.map(_ + 1)) { (it1, it2) => ... }
would do a single shuffle, but in practice this always seems to be the case.
* Exposed baseMargin in DataFrame-based API
* Addressed review comments
* Pass baseMargin to XGBoost.trainWithDataFrame via params
* Reverted MLLabeledPoint in Spark APIs
As discussed, baseMargin would only be supported for DataFrame-based APIs.
* Cleaned up baseMargin tests
- Removed RDD-based test, since the option is no longer exposed via
public APIs
- Changed DataFrame-based one to check that adding a margin actually
affects the prediction
* Pleased Scalastyle
* Addressed more review comments
* Pleased scalastyle again
* Fixed XGBoost.fromBaseMarginsToArray
which always returned an array of NaNs even if base margin was not
specified. Surprisingly this only failed a few tests.
This commit is contained in:
@@ -18,8 +18,7 @@ package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
|
||||
import org.apache.spark.ml.linalg.{Vectors => MLVectors}
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
|
||||
trait TrainTestData {
|
||||
protected def getResourceLines(resource: String): Iterator[String] = {
|
||||
@@ -32,60 +31,60 @@ trait TrainTestData {
|
||||
Source.fromInputStream(is).getLines()
|
||||
}
|
||||
|
||||
protected def getLabeledPoints(resource: String, zeroBased: Boolean): Seq[MLLabeledPoint] = {
|
||||
protected def getLabeledPoints(resource: String, zeroBased: Boolean): Seq[XGBLabeledPoint] = {
|
||||
getResourceLines(resource).map { line =>
|
||||
val labelAndFeatures = line.split(" ")
|
||||
val label = labelAndFeatures.head.toDouble
|
||||
val values = new Array[Double](126)
|
||||
val label = labelAndFeatures.head.toFloat
|
||||
val values = new Array[Float](126)
|
||||
for (feature <- labelAndFeatures.tail) {
|
||||
val idAndValue = feature.split(":")
|
||||
if (!zeroBased) {
|
||||
values(idAndValue(0).toInt - 1) = idAndValue(1).toDouble
|
||||
values(idAndValue(0).toInt - 1) = idAndValue(1).toFloat
|
||||
} else {
|
||||
values(idAndValue(0).toInt) = idAndValue(1).toDouble
|
||||
values(idAndValue(0).toInt) = idAndValue(1).toFloat
|
||||
}
|
||||
}
|
||||
|
||||
MLLabeledPoint(label, MLVectors.dense(values))
|
||||
XGBLabeledPoint(label, null, values)
|
||||
}.toList
|
||||
}
|
||||
}
|
||||
|
||||
object Classification extends TrainTestData {
|
||||
val train: Seq[MLLabeledPoint] = getLabeledPoints("/agaricus.txt.train", zeroBased = false)
|
||||
val test: Seq[MLLabeledPoint] = getLabeledPoints("/agaricus.txt.test", zeroBased = false)
|
||||
val train: Seq[XGBLabeledPoint] = getLabeledPoints("/agaricus.txt.train", zeroBased = false)
|
||||
val test: Seq[XGBLabeledPoint] = getLabeledPoints("/agaricus.txt.test", zeroBased = false)
|
||||
}
|
||||
|
||||
object MultiClassification extends TrainTestData {
|
||||
val train: Seq[MLLabeledPoint] = getLabeledPoints("/dermatology.data")
|
||||
val train: Seq[XGBLabeledPoint] = getLabeledPoints("/dermatology.data")
|
||||
|
||||
private def getLabeledPoints(resource: String): Seq[MLLabeledPoint] = {
|
||||
private def getLabeledPoints(resource: String): Seq[XGBLabeledPoint] = {
|
||||
getResourceLines(resource).map { line =>
|
||||
val featuresAndLabel = line.split(",")
|
||||
val label = featuresAndLabel.last.toDouble - 1
|
||||
val values = new Array[Double](featuresAndLabel.length - 1)
|
||||
val label = featuresAndLabel.last.toFloat - 1
|
||||
val values = new Array[Float](featuresAndLabel.length - 1)
|
||||
values(values.length - 1) =
|
||||
if (featuresAndLabel(featuresAndLabel.length - 2) == "?") 1 else 0
|
||||
for (i <- 0 until values.length - 2) {
|
||||
values(i) = featuresAndLabel(i).toDouble
|
||||
values(i) = featuresAndLabel(i).toFloat
|
||||
}
|
||||
|
||||
MLLabeledPoint(label, MLVectors.dense(values.take(values.length - 1)))
|
||||
XGBLabeledPoint(label, null, values.take(values.length - 1))
|
||||
}.toList
|
||||
}
|
||||
}
|
||||
|
||||
object Regression extends TrainTestData {
|
||||
val train: Seq[MLLabeledPoint] = getLabeledPoints("/machine.txt.train", zeroBased = true)
|
||||
val test: Seq[MLLabeledPoint] = getLabeledPoints("/machine.txt.test", zeroBased = true)
|
||||
val train: Seq[XGBLabeledPoint] = getLabeledPoints("/machine.txt.train", zeroBased = true)
|
||||
val test: Seq[XGBLabeledPoint] = getLabeledPoints("/machine.txt.test", zeroBased = true)
|
||||
}
|
||||
|
||||
object Ranking extends TrainTestData {
|
||||
val train0: Seq[MLLabeledPoint] = getLabeledPoints("/rank-demo-0.txt.train", zeroBased = false)
|
||||
val train1: Seq[MLLabeledPoint] = getLabeledPoints("/rank-demo-1.txt.train", zeroBased = false)
|
||||
val train0: Seq[XGBLabeledPoint] = getLabeledPoints("/rank-demo-0.txt.train", zeroBased = false)
|
||||
val train1: Seq[XGBLabeledPoint] = getLabeledPoints("/rank-demo-1.txt.train", zeroBased = false)
|
||||
val trainGroup0: Seq[Int] = getGroups("/rank-demo-0.txt.train.group")
|
||||
val trainGroup1: Seq[Int] = getGroups("/rank-demo-1.txt.train.group")
|
||||
val test: Seq[MLLabeledPoint] = getLabeledPoints("/rank-demo.txt.test", zeroBased = false)
|
||||
val test: Seq[XGBLabeledPoint] = getLabeledPoints("/rank-demo.txt.test", zeroBased = false)
|
||||
|
||||
private def getGroups(resource: String): Seq[Int] = {
|
||||
getResourceLines(resource).map(_.toInt).toList
|
||||
|
||||
@@ -18,6 +18,8 @@ package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
|
||||
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
@@ -27,19 +29,18 @@ class XGBoostConfigureSuite extends FunSuite with PerTest {
|
||||
.config("spark.kryo.classesToRegister", classOf[Booster].getName)
|
||||
|
||||
test("nthread configuration must be no larger than spark.task.cpus") {
|
||||
val trainingRDD = sc.parallelize(Classification.train)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic",
|
||||
"nthread" -> (sc.getConf.getInt("spark.task.cpus", 1) + 1))
|
||||
intercept[IllegalArgumentException] {
|
||||
XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers)
|
||||
XGBoost.trainWithRDD(sc.parallelize(List()), paramMap, 5, numWorkers)
|
||||
}
|
||||
}
|
||||
|
||||
test("kryoSerializer test") {
|
||||
import DataUtils._
|
||||
// TODO write an isolated test for Booster.
|
||||
val trainingRDD = sc.parallelize(Classification.train)
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator, null)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
|
||||
@@ -17,20 +17,22 @@
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
|
||||
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
|
||||
import org.apache.spark.ml.linalg.DenseVector
|
||||
import org.apache.spark.ml.param.ParamMap
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql._
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
class XGBoostDFSuite extends FunSuite with PerTest {
|
||||
private def buildDataFrame(
|
||||
instances: Seq[MLLabeledPoint],
|
||||
labeledPoints: Seq[XGBLabeledPoint],
|
||||
numPartitions: Int = numWorkers): DataFrame = {
|
||||
val it = instances.iterator.zipWithIndex
|
||||
.map { case (instance: MLLabeledPoint, id: Int) =>
|
||||
(id, instance.label, instance.features)
|
||||
import DataUtils._
|
||||
val it = labeledPoints.iterator.zipWithIndex
|
||||
.map { case (labeledPoint: XGBLabeledPoint, id: Int) =>
|
||||
(id, labeledPoint.label, labeledPoint.features)
|
||||
}
|
||||
|
||||
ss.createDataFrame(sc.parallelize(it.toList, numPartitions))
|
||||
@@ -42,7 +44,6 @@ class XGBoostDFSuite extends FunSuite with PerTest {
|
||||
"objective" -> "binary:logistic")
|
||||
val trainingItr = Classification.train.iterator
|
||||
val testItr = Classification.test.iterator
|
||||
import DataUtils._
|
||||
val round = 5
|
||||
val trainDMatrix = new DMatrix(trainingItr)
|
||||
val testDMatrix = new DMatrix(testItr)
|
||||
@@ -157,7 +158,6 @@ class XGBoostDFSuite extends FunSuite with PerTest {
|
||||
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
|
||||
round = 10, nWorkers = math.min(2, numWorkers))
|
||||
val error = new EvalError
|
||||
import DataUtils._
|
||||
val testSetDMatrix = new DMatrix(testItr)
|
||||
assert(error.eval(xgBoostModelWithDF.booster.predict(testSetDMatrix, outPutMargin = true),
|
||||
testSetDMatrix) < 0.1)
|
||||
@@ -193,4 +193,24 @@ class XGBoostDFSuite extends FunSuite with PerTest {
|
||||
assert(model.get[Double](model.eta).get == 0.1)
|
||||
assert(model.get[Int](model.maxDepth).get == 6)
|
||||
}
|
||||
|
||||
test("test use base margin") {
|
||||
import DataUtils._
|
||||
val trainingDf = buildDataFrame(Classification.train)
|
||||
val trainingDfWithMargin = trainingDf.withColumn("margin", functions.rand())
|
||||
val testRDD = sc.parallelize(Classification.test.map(_.features))
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "baseMarginCol" -> "margin")
|
||||
|
||||
def trainPredict(df: Dataset[_]): Array[Float] = {
|
||||
XGBoost.trainWithDataFrame(df, paramMap, round = 1, numWorkers)
|
||||
.predict(testRDD)
|
||||
.map { case Array(p) => p }
|
||||
.collect()
|
||||
}
|
||||
|
||||
val pred = trainPredict(trainingDf)
|
||||
val predWithMargin = trainPredict(trainingDfWithMargin)
|
||||
assert((pred, predWithMargin).zipped.exists { case (p, pwm) => p !== pwm })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,7 +19,6 @@ package ml.dmlc.xgboost4j.scala.spark
|
||||
import java.nio.file.Files
|
||||
import java.util.concurrent.LinkedBlockingDeque
|
||||
|
||||
import scala.collection.mutable.ListBuffer
|
||||
import scala.util.Random
|
||||
|
||||
import ml.dmlc.xgboost4j.java.Rabit
|
||||
@@ -27,8 +26,8 @@ import ml.dmlc.xgboost4j.scala.DMatrix
|
||||
import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
|
||||
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.ml.feature.LabeledPoint
|
||||
import org.apache.spark.ml.linalg.{Vectors, Vector => SparkVector}
|
||||
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
|
||||
import org.apache.spark.ml.linalg.{DenseVector, Vectors, Vector => SparkVector}
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
@@ -82,15 +81,15 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
"objective" -> "binary:logistic").toMap,
|
||||
new java.util.HashMap[String, String](),
|
||||
numWorkers = 2, round = 5, eval = null, obj = null, useExternalMemory = true,
|
||||
missing = Float.NaN, baseMargin = null)
|
||||
missing = Float.NaN)
|
||||
val boosterCount = boosterRDD.count()
|
||||
assert(boosterCount === 2)
|
||||
}
|
||||
|
||||
test("training with external memory cache") {
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train)
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
@@ -101,9 +100,9 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
}
|
||||
|
||||
test("training with Scala-implemented Rabit tracker") {
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train)
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic",
|
||||
@@ -115,9 +114,9 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
}
|
||||
|
||||
ignore("test with fast histo depthwise") {
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train)
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
@@ -130,9 +129,9 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
}
|
||||
|
||||
ignore("test with fast histo lossguide") {
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train)
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
@@ -145,9 +144,9 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
}
|
||||
|
||||
ignore("test with fast histo lossguide with max bin") {
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train)
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
@@ -161,9 +160,9 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
}
|
||||
|
||||
ignore("test with fast histo depthwidth with max depth") {
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train)
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
@@ -177,9 +176,9 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
}
|
||||
|
||||
ignore("test with fast histo depthwidth with max depth and max bin") {
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train)
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0",
|
||||
"objective" -> "binary:logistic", "tree_method" -> "hist",
|
||||
@@ -193,7 +192,7 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
}
|
||||
|
||||
test("test with dense vectors containing missing value") {
|
||||
def buildDenseRDD(): RDD[LabeledPoint] = {
|
||||
def buildDenseRDD(): RDD[MLLabeledPoint] = {
|
||||
val numRows = 100
|
||||
val numCols = 5
|
||||
|
||||
@@ -203,23 +202,24 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
if (c == numCols - 1) -0.1 else Random.nextDouble()
|
||||
}
|
||||
|
||||
LabeledPoint(label, Vectors.dense(values))
|
||||
MLLabeledPoint(label, Vectors.dense(values))
|
||||
}
|
||||
|
||||
sc.parallelize(labeledPoints)
|
||||
}
|
||||
|
||||
val trainingRDD = buildDenseRDD().repartition(4)
|
||||
val testRDD = buildDenseRDD().repartition(4)
|
||||
val testRDD = buildDenseRDD().repartition(4).map(_.features.asInstanceOf[DenseVector])
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers,
|
||||
useExternalMemory = true)
|
||||
xgBoostModel.predict(testRDD.map(_.features.toDense), missingValue = -0.1f).collect()
|
||||
xgBoostModel.predict(testRDD, missingValue = -0.1f).collect()
|
||||
}
|
||||
|
||||
test("test consistency of prediction functions with RDD") {
|
||||
val trainingRDD = sc.parallelize(Classification.train)
|
||||
import DataUtils._
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSet = Classification.test
|
||||
val testRDD = sc.parallelize(testSet, numSlices = 1).map(_.features)
|
||||
val testCollection = testRDD.collect()
|
||||
@@ -232,7 +232,6 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
val predRDD = xgBoostModel.predict(testRDD)
|
||||
val predResult1 = predRDD.collect()
|
||||
assert(testRDD.count() === predResult1.length)
|
||||
import DataUtils._
|
||||
val predResult2 = xgBoostModel.booster.predict(new DMatrix(testSet.iterator))
|
||||
for (i <- predResult1.indices; j <- predResult1(i).indices) {
|
||||
assert(predResult1(i)(j) === predResult2(i)(j))
|
||||
@@ -240,21 +239,22 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
}
|
||||
|
||||
test("test eval functions with RDD") {
|
||||
val trainingRDD = sc.parallelize(Classification.train).cache()
|
||||
import DataUtils._
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML).cache()
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5, nWorkers = numWorkers)
|
||||
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5, numWorkers)
|
||||
// Nan Zhu: deprecate it for now
|
||||
// xgBoostModel.eval(trainingRDD, "eval1", iter = 5, useExternalCache = false)
|
||||
xgBoostModel.eval(trainingRDD, "eval2", evalFunc = new EvalError, useExternalCache = false)
|
||||
}
|
||||
|
||||
test("test prediction functionality with empty partition") {
|
||||
import DataUtils._
|
||||
def buildEmptyRDD(sparkContext: Option[SparkContext] = None): RDD[SparkVector] = {
|
||||
sparkContext.getOrElse(sc).parallelize(List[SparkVector](), numWorkers)
|
||||
}
|
||||
|
||||
val trainingRDD = sc.parallelize(Classification.train)
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testRDD = buildEmptyRDD()
|
||||
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
|
||||
"objective" -> "binary:logistic").toMap
|
||||
@@ -263,9 +263,9 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
}
|
||||
|
||||
test("test model consistency after save and load") {
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train)
|
||||
import DataUtils._
|
||||
val eval = new EvalError()
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
val testSetDMatrix = new DMatrix(Classification.test.iterator)
|
||||
val tempDir = Files.createTempDirectory("xgboosttest-")
|
||||
val tempFile = Files.createTempFile(tempDir, "", "")
|
||||
@@ -283,9 +283,10 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
}
|
||||
|
||||
test("test save and load of different types of models") {
|
||||
import DataUtils._
|
||||
val tempDir = Files.createTempDirectory("xgboosttest-")
|
||||
val tempFile = Files.createTempFile(tempDir, "", "")
|
||||
val trainingRDD = sc.parallelize(Classification.train)
|
||||
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
|
||||
var paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:linear")
|
||||
// validate regression model
|
||||
@@ -320,7 +321,8 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
}
|
||||
|
||||
test("test use groupData") {
|
||||
val trainingRDD = sc.parallelize(Ranking.train0, numSlices = 1)
|
||||
import DataUtils._
|
||||
val trainingRDD = sc.parallelize(Ranking.train0, numSlices = 1).map(_.asML)
|
||||
val trainGroupData: Seq[Seq[Int]] = Seq(Ranking.trainGroup0)
|
||||
val testRDD = sc.parallelize(Ranking.test, numSlices = 1).map(_.features)
|
||||
|
||||
@@ -337,9 +339,10 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
}
|
||||
|
||||
test("test use nested groupData") {
|
||||
import DataUtils._
|
||||
val trainingRDD0 = sc.parallelize(Ranking.train0, numSlices = 1)
|
||||
val trainingRDD1 = sc.parallelize(Ranking.train1, numSlices = 1)
|
||||
val trainingRDD = trainingRDD0.union(trainingRDD1)
|
||||
val trainingRDD = trainingRDD0.union(trainingRDD1).map(_.asML)
|
||||
|
||||
val trainGroupData: Seq[Seq[Int]] = Seq(Ranking.trainGroup0, Ranking.trainGroup1)
|
||||
|
||||
@@ -353,27 +356,4 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
|
||||
val predResult1: Array[Array[Float]] = predRDD.collect()
|
||||
assert(testRDD.count() === predResult1.length)
|
||||
}
|
||||
|
||||
test("test use base margin") {
|
||||
val trainRDD = sc.parallelize(Ranking.train0, numSlices = 1)
|
||||
val testRDD = sc.parallelize(Ranking.test, numSlices = 1).map(_.features)
|
||||
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "rank:pairwise")
|
||||
|
||||
val trainMargin = {
|
||||
XGBoost.trainWithRDD(trainRDD, paramMap, round = 1, nWorkers = 2)
|
||||
.predict(trainRDD.map(_.features), outputMargin = true)
|
||||
.map { case Array(m) => m }
|
||||
}
|
||||
|
||||
val xgBoostModel = XGBoost.trainWithRDD(
|
||||
trainRDD,
|
||||
paramMap,
|
||||
round = 1,
|
||||
nWorkers = 2,
|
||||
baseMargin = trainMargin)
|
||||
|
||||
assert(testRDD.count() === xgBoostModel.predict(testRDD).count())
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user