[jvm-packages] Added baseMargin to ml.dmlc.xgboost4j.LabeledPoint (#2532)

* Converted ml.dmlc.xgboost4j.LabeledPoint to Scala

This makes it easy to integrate LabeledPoint with the Spark DataFrame APIs,
which support encoding and decoding case classes out of the box. An
alternative would have been to keep LabeledPoint in Java and turn it into a
bean by generating boilerplate getters and setters. I decided against that,
even though the conversion in this PR implies a public API change.
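
As a rough illustration of what the case-class form buys us (a sketch, not
part of this PR: DemoLabeledPoint is a hypothetical stand-in, since the real
LabeledPoint is package-private), Spark derives an Encoder for a case class
automatically, so a Dataset of points round-trips through the DataFrame API
with no bean-style boilerplate:

    import org.apache.spark.sql.SparkSession

    // Hypothetical stand-in mirroring the three core fields of
    // ml.dmlc.xgboost4j.LabeledPoint.
    case class DemoLabeledPoint(label: Float, indices: Array[Int], values: Array[Float])

    object CaseClassEncodingSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
        import spark.implicits._

        // The Product encoder comes for free with a case class.
        val ds = Seq(
          DemoLabeledPoint(1.0f, null, Array(0.5f, 1.5f)),
          DemoLabeledPoint(0.0f, Array(0, 2), Array(1.0f, 2.0f))
        ).toDS()

        ds.printSchema()
        spark.stop()
      }
    }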

I also had to remove the factory methods fromSparseVector and
fromDenseVector because a) they would have to be duplicated to support
overloaded calls with extra data (e.g. weight), and b) Scala would expose
them to Java via the mangled $.MODULE$ accessor, which looks ugly.
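
A sketch of the resulting call sites (assuming the caller lives inside the
ml.dmlc.xgboost4j package tree, since the class stays package-private): the
case-class apply covers both the dense and the sparse case that the removed
factories handled.

    package ml.dmlc.xgboost4j.scala.spark

    import ml.dmlc.xgboost4j.LabeledPoint

    object LabeledPointConstructionSketch {
      // Dense point: a null `indices` array means the values are positional.
      val dense = LabeledPoint(1.0f, null, Array(0.5f, 1.5f, 2.5f))

      // Sparse point: `indices` and `values` are parallel arrays of equal length.
      val sparse = LabeledPoint(0.0f, Array(0, 3), Array(1.0f, 4.0f))
    }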

Additionally, this commit makes it possible to switch to LabeledPoint in
all public APIs and thus to pass the initial margin/group as part of the
point. This seems to be the only reliable way to implement distributed
learning with such data. Note that the group size format used by single-node
XGBoost is not compatible with that scenario, since a partition split
could divide a group into two chunks.
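
For illustration, a point carrying its own margin and group (field names
match the case class added later in this diff, and again the caller is
assumed to sit inside the xgboost4j packages; whether the training code
already consumes the group field is beyond this sketch):

    import ml.dmlc.xgboost4j.LabeledPoint

    // The margin and group travel with the row itself, so repartitioning the
    // RDD cannot separate them from the features they belong to.
    val point = LabeledPoint(
      label = 1.0f,
      indices = null,
      values = Array(0.2f, 0.7f),
      group = 3,           // ranking group id, -1 when unused
      baseMargin = 0.35f)  // initial prediction, Float.NaN when unused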

* Switched to ml.dmlc.xgboost4j.LabeledPoint in RDD-based public APIs

Note that DataFrame-based and Flink APIs are not affected by this change.

* Removed baseMargin argument in favour of the LabeledPoint field

* Do a single pass over the partition in buildDistributedBoosters

Note that there is no formal guarantee that

    val repartitioned = rdd.repartition(42)
    repartitioned.zipPartitions(repartitioned.map(_ + 1)) { (it1, it2) => ... }

would do a single shuffle, but in practice this always seems to be the case.
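
A self-contained sketch of the pattern (local names are made up): the margin
RDD is derived by mapping over the already-partitioned training RDD, so both
sides of zipPartitions descend from the same shuffle and stay aligned per
partition.

    import org.apache.spark.sql.SparkSession

    object SingleShuffleSketch {
      // Tiny stand-in for the labeled point; only the field used here is kept.
      case class Point(value: Float, baseMargin: Float)

      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[2]").appName("sketch").getOrCreate()
        val sc = spark.sparkContext

        val partitioned = sc.parallelize(1 to 100)
          .map(i => Point(i.toFloat, i.toFloat / 100))
          .repartition(2)

        // Both RDDs descend from the same repartitioned parent, so in practice
        // one shuffle feeds both iterators and the rows stay aligned.
        val margins = partitioned.map(_.baseMargin)
        val aligned = partitioned.zipPartitions(margins) { (rows, ms) =>
          Iterator(rows.zip(ms).forall { case (r, m) => r.baseMargin == m })
        }.collect()

        println(aligned.mkString(", ")) // expected: true, true
        spark.stop()
      }
    }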

* Exposed baseMargin in DataFrame-based API

* Addressed review comments

* Pass baseMargin to XGBoost.trainWithDataFrame via params

* Reverted MLLabeledPoint in Spark APIs

As discussed, baseMargin is only supported in the DataFrame-based API.
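
A hedged usage sketch of the DataFrame-based path; the "baseMarginCol"
parameter and the "margin" column name mirror the test added below, while
the remaining parameter values are arbitrary and `trainingDf` is assumed to
already carry the usual "features" and "label" columns.

    import ml.dmlc.xgboost4j.scala.spark.{XGBoost, XGBoostModel}
    import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.functions.rand

    def trainWithMargin(trainingDf: DataFrame): XGBoostModel = {
      // Attach a per-row base margin; the estimator casts the column to Float.
      val withMargin = trainingDf.withColumn("margin", rand())

      val params = Map(
        "eta" -> "0.1",
        "max_depth" -> "6",
        "objective" -> "binary:logistic",
        "baseMarginCol" -> "margin") // tells the estimator where the margin lives

      XGBoost.trainWithDataFrame(withMargin, params, round = 10, nWorkers = 2)
    }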

* Cleaned up baseMargin tests

- Removed RDD-based test, since the option is no longer exposed via
  public APIs
- Changed DataFrame-based one to check that adding a margin actually
  affects the prediction

* Pleased Scalastyle

* Addressed more review comments

* Pleased scalastyle again

* Fixed XGBoost.fromBaseMarginsToArray

which always returned an array of NaNs even when no base margin was
specified. Surprisingly, this only broke a few tests.
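
The intended behaviour after the fix, restated as a REPL-style sketch (the
real helper is a private method on the XGBoost object, see the diff below;
marginsToArray here is a hypothetical stand-in):

    def marginsToArray(margins: Seq[Float]): Option[Array[Float]] = {
      val defined = margins.filterNot(_.isNaN)
      if (defined.isEmpty) None                         // no margin specified at all
      else if (defined.size == margins.size) Some(defined.toArray)
      else throw new IllegalArgumentException(
        "base margin must be either fully specified or fully NaN within a partition")
    }

    marginsToArray(Seq(Float.NaN, Float.NaN)) // None: setBaseMargin is never called
    marginsToArray(Seq(0.1f, 0.2f))           // Some(Array(0.1f, 0.2f))
    marginsToArray(Seq(0.1f, Float.NaN))      // IllegalArgumentException
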
Author: Sergei Lebedev, 2017-08-10 23:29:26 +02:00 (committed by Nan Zhu)
Parent: c1104f7d0a
Commit: 771a95aec6
16 changed files with 307 additions and 265 deletions

View File

@ -16,12 +16,13 @@
package ml.dmlc.xgboost4j.scala.example.spark package ml.dmlc.xgboost4j.scala.example.spark
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix} import ml.dmlc.xgboost4j.scala.Booster
import ml.dmlc.xgboost4j.scala.spark.{DataUtils, XGBoost} import ml.dmlc.xgboost4j.scala.spark.XGBoost
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.ml.linalg.{DenseVector => MLDenseVector}
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint} import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
import org.apache.spark.ml.linalg.{DenseVector => MLDenseVector}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}
object SparkWithRDD { object SparkWithRDD {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
@ -39,11 +40,10 @@ object SparkWithRDD {
val outputModelPath = args(4) val outputModelPath = args(4)
// number of iterations // number of iterations
val numRound = args(0).toInt val numRound = args(0).toInt
import DataUtils._
val trainRDD = MLUtils.loadLibSVMFile(sc, inputTrainPath).map(lp => val trainRDD = MLUtils.loadLibSVMFile(sc, inputTrainPath).map(lp =>
MLLabeledPoint(lp.label, new MLDenseVector(lp.features.toArray))) MLLabeledPoint(lp.label, new MLDenseVector(lp.features.toArray)))
val testSet = MLUtils.loadLibSVMFile(sc, inputTestPath).collect().map( val testSet = MLUtils.loadLibSVMFile(sc, inputTestPath)
lp => new MLDenseVector(lp.features.toArray)).iterator .map(lp => new MLDenseVector(lp.features.toArray))
// training parameters // training parameters
val paramMap = List( val paramMap = List(
"eta" -> 0.1f, "eta" -> 0.1f,
@ -51,7 +51,7 @@ object SparkWithRDD {
"objective" -> "binary:logistic").toMap "objective" -> "binary:logistic").toMap
val xgboostModel = XGBoost.trainWithRDD(trainRDD, paramMap, numRound, nWorkers = args(1).toInt, val xgboostModel = XGBoost.trainWithRDD(trainRDD, paramMap, numRound, nWorkers = args(1).toInt,
useExternalMemory = true) useExternalMemory = true)
xgboostModel.booster.predict(new DMatrix(testSet)) xgboostModel.predict(testSet, missingValue = Float.NaN)
// save model to HDFS path // save model to HDFS path
xgboostModel.saveModelAsHadoopFile(outputModelPath) xgboostModel.saveModelAsHadoopFile(outputModelPath)
} }

View File

@ -16,19 +16,19 @@
package ml.dmlc.xgboost4j.scala.flink package ml.dmlc.xgboost4j.scala.flink
import scala.collection.JavaConverters.asScalaIteratorConverter; import scala.collection.JavaConverters.asScalaIteratorConverter
import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.LabeledPoint
import ml.dmlc.xgboost4j.java.{RabitTracker, Rabit} import ml.dmlc.xgboost4j.java.{Rabit, RabitTracker}
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => XGBoostScala} import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => XGBoostScala}
import org.apache.commons.logging.LogFactory import org.apache.commons.logging.LogFactory
import org.apache.flink.api.common.functions.RichMapPartitionFunction import org.apache.flink.api.common.functions.RichMapPartitionFunction
import org.apache.flink.api.scala.DataSet import org.apache.flink.api.scala.{DataSet, _}
import org.apache.flink.api.scala._
import org.apache.flink.ml.common.LabeledVector import org.apache.flink.ml.common.LabeledVector
import org.apache.flink.util.Collector import org.apache.flink.util.Collector
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path
import org.apache.hadoop.conf.Configuration import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
object XGBoost { object XGBoost {
/** /**
@ -49,8 +49,7 @@ object XGBoost {
Rabit.init(workerEnvs) Rabit.init(workerEnvs)
val mapper = (x: LabeledVector) => { val mapper = (x: LabeledVector) => {
val (index, value) = x.vector.toSeq.unzip val (index, value) = x.vector.toSeq.unzip
LabeledPoint.fromSparseVector(x.label.toFloat, LabeledPoint(x.label.toFloat, index.toArray, value.map(_.toFloat).toArray)
index.toArray, value.map(z => z.toFloat).toArray)
} }
val dataIter = for (x <- it.iterator().asScala) yield mapper(x) val dataIter = for (x <- it.iterator().asScala) yield mapper(x)
val trainMat = new DMatrix(dataIter, null) val trainMat = new DMatrix(dataIter, null)

View File

@ -17,13 +17,12 @@
package ml.dmlc.xgboost4j.scala.flink package ml.dmlc.xgboost4j.scala.flink
import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.LabeledPoint
import ml.dmlc.xgboost4j.scala.{DMatrix, Booster} import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
import org.apache.flink.api.scala.DataSet
import org.apache.flink.api.scala._ import org.apache.flink.api.scala.{DataSet, _}
import org.apache.flink.ml.math.Vector import org.apache.flink.ml.math.Vector
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path
import org.apache.hadoop.conf.Configuration import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
class XGBoostModel (booster: Booster) extends Serializable { class XGBoostModel (booster: Booster) extends Serializable {
/** /**
@ -57,8 +56,7 @@ class XGBoostModel (booster: Booster) extends Serializable {
(it: Iterator[Vector]) => { (it: Iterator[Vector]) => {
val mapper = (x: Vector) => { val mapper = (x: Vector) => {
val (index, value) = x.toSeq.unzip val (index, value) = x.toSeq.unzip
LabeledPoint.fromSparseVector(0.0f, LabeledPoint(0.0f, index.toArray, value.map(_.toFloat).toArray)
index.toArray, value.map(z => z.toFloat).toArray)
} }
val dataIter = for (x <- it) yield mapper(x) val dataIter = for (x <- it) yield mapper(x)
val dmat = new DMatrix(dataIter, null) val dmat = new DMatrix(dataIter, null)

View File

@ -16,47 +16,55 @@
package ml.dmlc.xgboost4j.scala.spark package ml.dmlc.xgboost4j.scala.spark
import scala.collection.JavaConverters._ import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint} import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
object DataUtils extends Serializable { object DataUtils extends Serializable {
private[spark] implicit class XGBLabeledPointFeatures(
val labeledPoint: XGBLabeledPoint
) extends AnyVal {
/** Converts the point to [[MLLabeledPoint]]. */
private[spark] def asML: MLLabeledPoint = {
MLLabeledPoint(labeledPoint.label, labeledPoint.features)
}
implicit def fromSparkPointsToXGBoostPointsJava(sps: Iterator[MLLabeledPoint]) /**
: java.util.Iterator[LabeledPoint] = { * Returns feature of the point as [[org.apache.spark.ml.linalg.Vector]].
fromSparkPointsToXGBoostPoints(sps).asJava *
} * If the point is sparse, the dimensionality of the resulting sparse
* vector would be [[Int.MaxValue]]. This is the only safe value, since
implicit def fromSparkPointsToXGBoostPoints(sps: Iterator[MLLabeledPoint]): * XGBoost does not store the dimensionality explicitly.
Iterator[LabeledPoint] = { */
for (p <- sps) yield { def features: Vector = if (labeledPoint.indices == null) {
p.features match { Vectors.dense(labeledPoint.values.map(_.toDouble))
case denseFeature: DenseVector => } else {
LabeledPoint.fromDenseVector(p.label.toFloat, denseFeature.values.map(_.toFloat)) Vectors.sparse(Int.MaxValue, labeledPoint.indices, labeledPoint.values.map(_.toDouble))
case sparseFeature: SparseVector =>
LabeledPoint.fromSparseVector(p.label.toFloat, sparseFeature.indices,
sparseFeature.values.map(_.toFloat))
}
} }
} }
implicit def fromSparkVectorToXGBoostPointsJava(sps: Iterator[Vector]) private[spark] implicit class MLLabeledPointToXGBLabeledPoint(
: java.util.Iterator[LabeledPoint] = { val labeledPoint: MLLabeledPoint
fromSparkVectorToXGBoostPoints(sps).asJava ) extends AnyVal {
/** Converts an [[MLLabeledPoint]] to an [[XGBLabeledPoint]]. */
def asXGB: XGBLabeledPoint = {
labeledPoint.features.asXGB.copy(label = labeledPoint.label.toFloat)
}
} }
implicit def fromSparkVectorToXGBoostPoints(sps: Iterator[Vector]) private[spark] implicit class MLVectorToXGBLabeledPoint(val v: Vector) extends AnyVal {
: Iterator[LabeledPoint] = { /**
for (p <- sps) yield { * Converts a [[Vector]] to a data point with a dummy label.
p match { *
case denseFeature: DenseVector => * This is needed for constructing a [[ml.dmlc.xgboost4j.scala.DMatrix]]
LabeledPoint.fromDenseVector(0.0f, denseFeature.values.map(_.toFloat)) * for prediction.
case sparseFeature: SparseVector => */
LabeledPoint.fromSparseVector(0.0f, sparseFeature.indices, def asXGB: XGBLabeledPoint = v match {
sparseFeature.values.map(_.toFloat)) case v: DenseVector =>
} XGBLabeledPoint(0.0f, null, v.values.map(_.toFloat))
case v: SparseVector =>
XGBLabeledPoint(0.0f, v.indices, v.values.map(_.toFloat))
} }
} }
} }

View File

@ -21,13 +21,13 @@ import scala.collection.mutable
import ml.dmlc.xgboost4j.java.{IRabitTracker, Rabit, XGBoostError, RabitTracker => PyRabitTracker} import ml.dmlc.xgboost4j.java.{IRabitTracker, Rabit, XGBoostError, RabitTracker => PyRabitTracker}
import ml.dmlc.xgboost4j.scala.rabit.RabitTracker import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _} import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
import org.apache.commons.logging.LogFactory import org.apache.commons.logging.LogFactory
import org.apache.hadoop.fs.{FSDataInputStream, Path} import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset import org.apache.spark.sql.Dataset
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
import org.apache.spark.{SparkContext, TaskContext} import org.apache.spark.{SparkContext, TaskContext}
object TrackerConf { object TrackerConf {
@ -52,30 +52,49 @@ object XGBoost extends Serializable {
private val logger = LogFactory.getLog("XGBoostSpark") private val logger = LogFactory.getLog("XGBoostSpark")
private def fromDenseToSparseLabeledPoints( private def fromDenseToSparseLabeledPoints(
denseLabeledPoints: Iterator[MLLabeledPoint], denseLabeledPoints: Iterator[XGBLabeledPoint],
missing: Float): Iterator[MLLabeledPoint] = { missing: Float): Iterator[XGBLabeledPoint] = {
if (!missing.isNaN) { if (!missing.isNaN) {
denseLabeledPoints.map { case MLLabeledPoint(label, features) => denseLabeledPoints.map { labeledPoint =>
val dFeatures = features.toDense val indicesBuilder = new mutable.ArrayBuilder.ofInt()
val indices = new mutable.ArrayBuilder.ofInt() val valuesBuilder = new mutable.ArrayBuilder.ofFloat()
val values = new mutable.ArrayBuilder.ofDouble() for ((value, i) <- labeledPoint.values.zipWithIndex if value != missing) {
for (i <- dFeatures.values.indices) { indicesBuilder += (if (labeledPoint.indices == null) i else labeledPoint.indices(i))
if (dFeatures.values(i) != missing) { valuesBuilder += value
indices += i
values += dFeatures.values(i)
}
} }
val sFeatures = new SparseVector(dFeatures.values.length, indices.result(), labeledPoint.copy(indices = indicesBuilder.result(), values = valuesBuilder.result())
values.result())
MLLabeledPoint(label, sFeatures)
} }
} else { } else {
denseLabeledPoints denseLabeledPoints
} }
} }
private def fromBaseMarginsToArray(baseMargins: Iterator[Float]): Option[Array[Float]] = {
val builder = new mutable.ArrayBuilder.ofFloat()
var nTotal = 0
var nUndefined = 0
while (baseMargins.hasNext) {
nTotal += 1
val baseMargin = baseMargins.next()
if (baseMargin.isNaN) {
nUndefined += 1 // don't waste space for all-NaNs.
} else {
builder += baseMargin
}
}
if (nUndefined == nTotal) {
None
} else if (nUndefined == 0) {
Some(builder.result())
} else {
throw new IllegalArgumentException(
s"Encountered a partition with $nUndefined NaN base margin values. " +
"If you want to specify base margin, ensure all values are non-NaN.")
}
}
private[spark] def buildDistributedBoosters( private[spark] def buildDistributedBoosters(
trainingSet: RDD[MLLabeledPoint], trainingSet: RDD[XGBLabeledPoint],
params: Map[String, Any], params: Map[String, Any],
rabitEnv: java.util.Map[String, String], rabitEnv: java.util.Map[String, String],
numWorkers: Int, numWorkers: Int,
@ -83,25 +102,20 @@ object XGBoost extends Serializable {
obj: ObjectiveTrait, obj: ObjectiveTrait,
eval: EvalTrait, eval: EvalTrait,
useExternalMemory: Boolean, useExternalMemory: Boolean,
missing: Float, missing: Float): RDD[Booster] = {
baseMargin: RDD[Float]): RDD[Booster] = {
import DataUtils._
val partitionedTrainingSet = if (trainingSet.getNumPartitions != numWorkers) { val partitionedTrainingSet = if (trainingSet.getNumPartitions != numWorkers) {
logger.info(s"repartitioning training set to $numWorkers partitions") logger.info(s"repartitioning training set to $numWorkers partitions")
trainingSet.repartition(numWorkers) trainingSet.repartition(numWorkers)
} else { } else {
trainingSet trainingSet
} }
val partitionedBaseMargin = Option(baseMargin) val partitionedBaseMargin = partitionedTrainingSet.map(_.baseMargin)
.getOrElse(trainingSet.sparkContext.emptyRDD)
.repartition(partitionedTrainingSet.getNumPartitions)
val appName = partitionedTrainingSet.context.appName val appName = partitionedTrainingSet.context.appName
// to workaround the empty partitions in training dataset, // to workaround the empty partitions in training dataset,
// this might not be the best efficient implementation, see // this might not be the best efficient implementation, see
// (https://github.com/dmlc/xgboost/issues/1277) // (https://github.com/dmlc/xgboost/issues/1277)
partitionedTrainingSet.zipPartitions(partitionedBaseMargin) { (trainingSamples, baseMargin) => partitionedTrainingSet.zipPartitions(partitionedBaseMargin) { (trainingPoints, baseMargins) =>
if (trainingSamples.isEmpty) { if (trainingPoints.isEmpty) {
throw new XGBoostError( throw new XGBoostError(
s"detected an empty partition in the training data, partition ID:" + s"detected an empty partition in the training data, partition ID:" +
s" ${TaskContext.getPartitionId()}") s" ${TaskContext.getPartitionId()}")
@ -114,16 +128,15 @@ object XGBoost extends Serializable {
} }
rabitEnv.put("DMLC_TASK_ID", TaskContext.getPartitionId().toString) rabitEnv.put("DMLC_TASK_ID", TaskContext.getPartitionId().toString)
Rabit.init(rabitEnv) Rabit.init(rabitEnv)
val partitionItr = fromDenseToSparseLabeledPoints(trainingSamples, missing) val trainingMatrix = new DMatrix(
val trainingMatrix = new DMatrix(partitionItr, cacheFileName) fromDenseToSparseLabeledPoints(trainingPoints, missing), cacheFileName)
try { try {
// TODO: use group attribute from the points.
if (params.contains("groupData") && params("groupData") != null) { if (params.contains("groupData") && params("groupData") != null) {
trainingMatrix.setGroup(params("groupData").asInstanceOf[Seq[Seq[Int]]]( trainingMatrix.setGroup(params("groupData").asInstanceOf[Seq[Seq[Int]]](
TaskContext.getPartitionId()).toArray) TaskContext.getPartitionId()).toArray)
} }
if (baseMargin.nonEmpty) { fromBaseMarginsToArray(baseMargins).foreach(trainingMatrix.setBaseMargin)
trainingMatrix.setBaseMargin(baseMargin.toArray)
}
val booster = SXGBoost.train(trainingMatrix, params, round, val booster = SXGBoost.train(trainingMatrix, params, round,
watches = Map("train" -> trainingMatrix), obj, eval) watches = Map("train" -> trainingMatrix), obj, eval)
Iterator(booster) Iterator(booster)
@ -199,7 +212,6 @@ object XGBoost extends Serializable {
* @param useExternalMemory indicate whether to use external memory cache, by setting this flag as * @param useExternalMemory indicate whether to use external memory cache, by setting this flag as
* true, the user may save the RAM cost for running XGBoost within Spark * true, the user may save the RAM cost for running XGBoost within Spark
* @param missing the value represented the missing value in the dataset * @param missing the value represented the missing value in the dataset
* @param baseMargin initial prediction for boosting.
* @throws ml.dmlc.xgboost4j.java.XGBoostError when the model training is failed * @throws ml.dmlc.xgboost4j.java.XGBoostError when the model training is failed
* @return XGBoostModel when successful training * @return XGBoostModel when successful training
*/ */
@ -212,10 +224,9 @@ object XGBoost extends Serializable {
obj: ObjectiveTrait = null, obj: ObjectiveTrait = null,
eval: EvalTrait = null, eval: EvalTrait = null,
useExternalMemory: Boolean = false, useExternalMemory: Boolean = false,
missing: Float = Float.NaN, missing: Float = Float.NaN): XGBoostModel = {
baseMargin: RDD[Float] = null): XGBoostModel = {
trainWithRDD(trainingData, params, round, nWorkers, obj, eval, useExternalMemory, trainWithRDD(trainingData, params, round, nWorkers, obj, eval, useExternalMemory,
missing, baseMargin) missing)
} }
private def overrideParamsAccordingToTaskCPUs( private def overrideParamsAccordingToTaskCPUs(
@ -257,7 +268,6 @@ object XGBoost extends Serializable {
* @param useExternalMemory indicate whether to use external memory cache, by setting this flag as * @param useExternalMemory indicate whether to use external memory cache, by setting this flag as
* true, the user may save the RAM cost for running XGBoost within Spark * true, the user may save the RAM cost for running XGBoost within Spark
* @param missing the value represented the missing value in the dataset * @param missing the value represented the missing value in the dataset
* @param baseMargin initial prediction for boosting.
* @throws ml.dmlc.xgboost4j.java.XGBoostError when the model training is failed * @throws ml.dmlc.xgboost4j.java.XGBoostError when the model training is failed
* @return XGBoostModel when successful training * @return XGBoostModel when successful training
*/ */
@ -270,30 +280,46 @@ object XGBoost extends Serializable {
obj: ObjectiveTrait = null, obj: ObjectiveTrait = null,
eval: EvalTrait = null, eval: EvalTrait = null,
useExternalMemory: Boolean = false, useExternalMemory: Boolean = false,
missing: Float = Float.NaN, missing: Float = Float.NaN): XGBoostModel = {
baseMargin: RDD[Float] = null): XGBoostModel = { import DataUtils._
val xgbTrainingData = trainingData.map { case MLLabeledPoint(label, features) =>
features.asXGB.copy(label = label.toFloat)
}
trainDistributed(xgbTrainingData, params, round, nWorkers, obj, eval,
useExternalMemory, missing)
}
@throws(classOf[XGBoostError])
private[spark] def trainDistributed(
trainingData: RDD[XGBLabeledPoint],
params: Map[String, Any],
round: Int,
nWorkers: Int,
obj: ObjectiveTrait = null,
eval: EvalTrait = null,
useExternalMemory: Boolean = false,
missing: Float = Float.NaN): XGBoostModel = {
if (params.contains("tree_method")) { if (params.contains("tree_method")) {
require(params("tree_method") != "hist", "xgboost4j-spark does not support fast histogram" + require(params("tree_method") != "hist", "xgboost4j-spark does not support fast histogram" +
" for now") " for now")
} }
require(nWorkers > 0, "you must specify more than 0 workers") require(nWorkers > 0, "you must specify more than 0 workers")
if (obj != null) { if (obj != null) {
require(params.get("obj_type").isDefined, "parameter \"obj_type\" is not defined," + require(params.get("obj_type").isDefined, "parameter \"obj_type\" is not defined," +
" you have to specify the objective type as classification or regression with a" + " you have to specify the objective type as classification or regression with a" +
" customized objective function") " customized objective function")
} }
val trackerConf = params.get("tracker_conf") match { val trackerConf = params.get("tracker_conf") match {
case None => TrackerConf() case None => TrackerConf()
case Some(conf: TrackerConf) => conf case Some(conf: TrackerConf) => conf
case _ => throw new IllegalArgumentException("parameter \"tracker_conf\" must be an " + case _ => throw new IllegalArgumentException("parameter \"tracker_conf\" must be an " +
"instance of TrackerConf.") "instance of TrackerConf.")
} }
val tracker = startTracker(nWorkers, trackerConf) val tracker = startTracker(nWorkers, trackerConf)
try { try {
val overriddenParams = overrideParamsAccordingToTaskCPUs(params, trainingData.sparkContext) val overriddenParams = overrideParamsAccordingToTaskCPUs(params, trainingData.sparkContext)
val boosters = buildDistributedBoosters(trainingData, overriddenParams, val boosters = buildDistributedBoosters(trainingData, overriddenParams,
tracker.getWorkerEnvs, nWorkers, round, obj, eval, useExternalMemory, missing, tracker.getWorkerEnvs, nWorkers, round, obj, eval, useExternalMemory, missing)
baseMargin)
val sparkJobThread = new Thread() { val sparkJobThread = new Thread() {
override def run() { override def run() {
// force the job // force the job

View File

@ -19,23 +19,23 @@ package ml.dmlc.xgboost4j.scala.spark
import scala.collection.mutable import scala.collection.mutable
import ml.dmlc.xgboost4j.scala.spark.params._ import ml.dmlc.xgboost4j.scala.spark.params._
import org.json4s.DefaultFormats import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
import org.apache.spark.ml.Predictor import org.apache.spark.ml.Predictor
import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
import org.apache.spark.ml.linalg.{Vector => MLVector}
import org.apache.spark.ml.param._ import org.apache.spark.ml.param._
import org.apache.spark.ml.util._ import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._ import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType import org.apache.spark.sql.types.FloatType
import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.{Dataset, Row}
import org.json4s.DefaultFormats
/** /**
* XGBoost Estimator to produce a XGBoost model * XGBoost Estimator to produce a XGBoost model
*/ */
class XGBoostEstimator private[spark]( class XGBoostEstimator private[spark](
override val uid: String, xgboostParams: Map[String, Any]) override val uid: String, xgboostParams: Map[String, Any])
extends Predictor[MLVector, XGBoostEstimator, XGBoostModel] extends Predictor[Vector, XGBoostEstimator, XGBoostModel]
with LearningTaskParams with GeneralParams with BoosterParams with MLWritable { with LearningTaskParams with GeneralParams with BoosterParams with MLWritable {
def this(xgboostParams: Map[String, Any]) = def this(xgboostParams: Map[String, Any]) =
@ -107,18 +107,32 @@ class XGBoostEstimator private[spark](
} }
} }
private def ensureColumns(trainingSet: Dataset[_]): Dataset[_] = {
if (trainingSet.columns.contains($(baseMarginCol))) {
trainingSet
} else {
trainingSet.withColumn($(baseMarginCol), lit(Float.NaN))
}
}
/** /**
* produce a XGBoostModel by fitting the given dataset * produce a XGBoostModel by fitting the given dataset
*/ */
override def train(trainingSet: Dataset[_]): XGBoostModel = { override def train(trainingSet: Dataset[_]): XGBoostModel = {
val instances = trainingSet.select( val instances = ensureColumns(trainingSet).select(
col($(featuresCol)), col($(labelCol)).cast(DoubleType)).rdd.map { col($(featuresCol)),
case Row(feature: MLVector, label: Double) => col($(labelCol)).cast(FloatType),
LabeledPoint(label, feature) col($(baseMarginCol)).cast(FloatType)
).rdd.map { case Row(features: Vector, label: Float, baseMargin: Float) =>
val (indices, values) = features match {
case v: SparseVector => (v.indices, v.values.map(_.toFloat))
case v: DenseVector => (null, v.values.map(_.toFloat))
}
XGBLabeledPoint(label.toFloat, indices, values, baseMargin = baseMargin)
} }
transformSchema(trainingSet.schema, logging = true) transformSchema(trainingSet.schema, logging = true)
val derivedXGBoosterParamMap = fromParamsToXGBParamMap val derivedXGBoosterParamMap = fromParamsToXGBParamMap
val trainedModel = XGBoost.trainWithRDD(instances, derivedXGBoosterParamMap, val trainedModel = XGBoost.trainDistributed(instances, derivedXGBoosterParamMap,
$(round), $(nWorkers), $(customObj), $(customEval), $(useExternalMemory), $(round), $(nWorkers), $(customObj), $(customEval), $(useExternalMemory),
$(missing)).setParent(this) $(missing)).setParent(this)
val returnedModel = copyValues(trainedModel, extractParamMap()) val returnedModel = copyValues(trainedModel, extractParamMap())

View File

@ -21,6 +21,7 @@ import scala.collection.JavaConverters._
import ml.dmlc.xgboost4j.java.Rabit import ml.dmlc.xgboost4j.java.Rabit
import ml.dmlc.xgboost4j.scala.spark.params.{BoosterParams, DefaultXGBoostParamsWriter} import ml.dmlc.xgboost4j.scala.spark.params.{BoosterParams, DefaultXGBoostParamsWriter}
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, EvalTrait} import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, EvalTrait}
import org.apache.hadoop.fs.{FSDataOutputStream, Path} import org.apache.hadoop.fs.{FSDataOutputStream, Path}
import org.apache.spark.ml.PredictionModel import org.apache.spark.ml.PredictionModel
@ -66,7 +67,7 @@ abstract class XGBoostModel(protected var _booster: Booster)
val rabitEnv = Map("DMLC_TASK_ID" -> TaskContext.getPartitionId().toString) val rabitEnv = Map("DMLC_TASK_ID" -> TaskContext.getPartitionId().toString)
Rabit.init(rabitEnv.asJava) Rabit.init(rabitEnv.asJava)
if (testSamples.nonEmpty) { if (testSamples.nonEmpty) {
val dMatrix = new DMatrix(testSamples) val dMatrix = new DMatrix(testSamples.map(_.asXGB))
try { try {
broadcastBooster.value.predictLeaf(dMatrix).iterator broadcastBooster.value.predictLeaf(dMatrix).iterator
} finally { } finally {
@ -103,6 +104,7 @@ abstract class XGBoostModel(protected var _booster: Booster)
val appName = evalDataset.context.appName val appName = evalDataset.context.appName
val allEvalMetrics = evalDataset.mapPartitions { val allEvalMetrics = evalDataset.mapPartitions {
labeledPointsPartition => labeledPointsPartition =>
import DataUtils._
if (labeledPointsPartition.hasNext) { if (labeledPointsPartition.hasNext) {
val rabitEnv = Map("DMLC_TASK_ID" -> TaskContext.getPartitionId().toString) val rabitEnv = Map("DMLC_TASK_ID" -> TaskContext.getPartitionId().toString)
Rabit.init(rabitEnv.asJava) Rabit.init(rabitEnv.asJava)
@ -114,8 +116,7 @@ abstract class XGBoostModel(protected var _booster: Booster)
null null
} }
} }
import DataUtils._ val dMatrix = new DMatrix(labeledPointsPartition.map(_.features.asXGB), cacheFileName)
val dMatrix = new DMatrix(labeledPointsPartition, cacheFileName)
try { try {
if (groupData != null) { if (groupData != null) {
dMatrix.setGroup(groupData(TaskContext.getPartitionId()).toArray) dMatrix.setGroup(groupData(TaskContext.getPartitionId()).toArray)
@ -202,7 +203,7 @@ abstract class XGBoostModel(protected var _booster: Booster)
null null
} }
} }
val dMatrix = new DMatrix(testSamples, cacheFileName) val dMatrix = new DMatrix(testSamples.map(_.asXGB), cacheFileName)
try { try {
broadcastBooster.value.predict(dMatrix).iterator broadcastBooster.value.predict(dMatrix).iterator
} finally { } finally {
@ -250,7 +251,7 @@ abstract class XGBoostModel(protected var _booster: Booster)
null null
} }
} }
val testDataset = new DMatrix(vectorIterator, cachePrefix) val testDataset = new DMatrix(vectorIterator.map(_.asXGB), cachePrefix)
try { try {
val rawPredictResults = { val rawPredictResults = {
if (!predLeaf) { if (!predLeaf) {

View File

@ -60,7 +60,13 @@ trait LearningTaskParams extends Params {
val groupData = new GroupDataParam(this, "groupData", "group data specify each group size" + val groupData = new GroupDataParam(this, "groupData", "group data specify each group size" +
" for ranking task. To correspond to partition of training data, it is nested.") " for ranking task. To correspond to partition of training data, it is nested.")
setDefault(objective -> "reg:linear", baseScore -> 0.5, numClasses -> 2, groupData -> null) /**
* Initial prediction (aka base margin) column name.
*/
val baseMarginCol = new Param[String](this, "baseMarginCol", "base margin column name")
setDefault(objective -> "reg:linear", baseScore -> 0.5, numClasses -> 2, groupData -> null,
baseMarginCol -> "baseMargin")
} }
private[spark] object LearningTaskParams { private[spark] object LearningTaskParams {

View File

@ -18,8 +18,7 @@ package ml.dmlc.xgboost4j.scala.spark
import scala.io.Source import scala.io.Source
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint} import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
import org.apache.spark.ml.linalg.{Vectors => MLVectors}
trait TrainTestData { trait TrainTestData {
protected def getResourceLines(resource: String): Iterator[String] = { protected def getResourceLines(resource: String): Iterator[String] = {
@ -32,60 +31,60 @@ trait TrainTestData {
Source.fromInputStream(is).getLines() Source.fromInputStream(is).getLines()
} }
protected def getLabeledPoints(resource: String, zeroBased: Boolean): Seq[MLLabeledPoint] = { protected def getLabeledPoints(resource: String, zeroBased: Boolean): Seq[XGBLabeledPoint] = {
getResourceLines(resource).map { line => getResourceLines(resource).map { line =>
val labelAndFeatures = line.split(" ") val labelAndFeatures = line.split(" ")
val label = labelAndFeatures.head.toDouble val label = labelAndFeatures.head.toFloat
val values = new Array[Double](126) val values = new Array[Float](126)
for (feature <- labelAndFeatures.tail) { for (feature <- labelAndFeatures.tail) {
val idAndValue = feature.split(":") val idAndValue = feature.split(":")
if (!zeroBased) { if (!zeroBased) {
values(idAndValue(0).toInt - 1) = idAndValue(1).toDouble values(idAndValue(0).toInt - 1) = idAndValue(1).toFloat
} else { } else {
values(idAndValue(0).toInt) = idAndValue(1).toDouble values(idAndValue(0).toInt) = idAndValue(1).toFloat
} }
} }
MLLabeledPoint(label, MLVectors.dense(values)) XGBLabeledPoint(label, null, values)
}.toList }.toList
} }
} }
object Classification extends TrainTestData { object Classification extends TrainTestData {
val train: Seq[MLLabeledPoint] = getLabeledPoints("/agaricus.txt.train", zeroBased = false) val train: Seq[XGBLabeledPoint] = getLabeledPoints("/agaricus.txt.train", zeroBased = false)
val test: Seq[MLLabeledPoint] = getLabeledPoints("/agaricus.txt.test", zeroBased = false) val test: Seq[XGBLabeledPoint] = getLabeledPoints("/agaricus.txt.test", zeroBased = false)
} }
object MultiClassification extends TrainTestData { object MultiClassification extends TrainTestData {
val train: Seq[MLLabeledPoint] = getLabeledPoints("/dermatology.data") val train: Seq[XGBLabeledPoint] = getLabeledPoints("/dermatology.data")
private def getLabeledPoints(resource: String): Seq[MLLabeledPoint] = { private def getLabeledPoints(resource: String): Seq[XGBLabeledPoint] = {
getResourceLines(resource).map { line => getResourceLines(resource).map { line =>
val featuresAndLabel = line.split(",") val featuresAndLabel = line.split(",")
val label = featuresAndLabel.last.toDouble - 1 val label = featuresAndLabel.last.toFloat - 1
val values = new Array[Double](featuresAndLabel.length - 1) val values = new Array[Float](featuresAndLabel.length - 1)
values(values.length - 1) = values(values.length - 1) =
if (featuresAndLabel(featuresAndLabel.length - 2) == "?") 1 else 0 if (featuresAndLabel(featuresAndLabel.length - 2) == "?") 1 else 0
for (i <- 0 until values.length - 2) { for (i <- 0 until values.length - 2) {
values(i) = featuresAndLabel(i).toDouble values(i) = featuresAndLabel(i).toFloat
} }
MLLabeledPoint(label, MLVectors.dense(values.take(values.length - 1))) XGBLabeledPoint(label, null, values.take(values.length - 1))
}.toList }.toList
} }
} }
object Regression extends TrainTestData { object Regression extends TrainTestData {
val train: Seq[MLLabeledPoint] = getLabeledPoints("/machine.txt.train", zeroBased = true) val train: Seq[XGBLabeledPoint] = getLabeledPoints("/machine.txt.train", zeroBased = true)
val test: Seq[MLLabeledPoint] = getLabeledPoints("/machine.txt.test", zeroBased = true) val test: Seq[XGBLabeledPoint] = getLabeledPoints("/machine.txt.test", zeroBased = true)
} }
object Ranking extends TrainTestData { object Ranking extends TrainTestData {
val train0: Seq[MLLabeledPoint] = getLabeledPoints("/rank-demo-0.txt.train", zeroBased = false) val train0: Seq[XGBLabeledPoint] = getLabeledPoints("/rank-demo-0.txt.train", zeroBased = false)
val train1: Seq[MLLabeledPoint] = getLabeledPoints("/rank-demo-1.txt.train", zeroBased = false) val train1: Seq[XGBLabeledPoint] = getLabeledPoints("/rank-demo-1.txt.train", zeroBased = false)
val trainGroup0: Seq[Int] = getGroups("/rank-demo-0.txt.train.group") val trainGroup0: Seq[Int] = getGroups("/rank-demo-0.txt.train.group")
val trainGroup1: Seq[Int] = getGroups("/rank-demo-1.txt.train.group") val trainGroup1: Seq[Int] = getGroups("/rank-demo-1.txt.train.group")
val test: Seq[MLLabeledPoint] = getLabeledPoints("/rank-demo.txt.test", zeroBased = false) val test: Seq[XGBLabeledPoint] = getLabeledPoints("/rank-demo.txt.test", zeroBased = false)
private def getGroups(resource: String): Seq[Int] = { private def getGroups(resource: String): Seq[Int] = {
getResourceLines(resource).map(_.toInt).toList getResourceLines(resource).map(_.toInt).toList

View File

@ -18,6 +18,8 @@ package ml.dmlc.xgboost4j.scala.spark
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix} import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite import org.scalatest.FunSuite
@ -27,19 +29,18 @@ class XGBoostConfigureSuite extends FunSuite with PerTest {
.config("spark.kryo.classesToRegister", classOf[Booster].getName) .config("spark.kryo.classesToRegister", classOf[Booster].getName)
test("nthread configuration must be no larger than spark.task.cpus") { test("nthread configuration must be no larger than spark.task.cpus") {
val trainingRDD = sc.parallelize(Classification.train)
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1", val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
"objective" -> "binary:logistic", "objective" -> "binary:logistic",
"nthread" -> (sc.getConf.getInt("spark.task.cpus", 1) + 1)) "nthread" -> (sc.getConf.getInt("spark.task.cpus", 1) + 1))
intercept[IllegalArgumentException] { intercept[IllegalArgumentException] {
XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers) XGBoost.trainWithRDD(sc.parallelize(List()), paramMap, 5, numWorkers)
} }
} }
test("kryoSerializer test") { test("kryoSerializer test") {
import DataUtils._ import DataUtils._
// TODO write an isolated test for Booster. // TODO write an isolated test for Booster.
val trainingRDD = sc.parallelize(Classification.train) val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
val testSetDMatrix = new DMatrix(Classification.test.iterator, null) val testSetDMatrix = new DMatrix(Classification.test.iterator, null)
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1", val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
"objective" -> "binary:logistic") "objective" -> "binary:logistic")

View File

@ -17,20 +17,22 @@
package ml.dmlc.xgboost4j.scala.spark package ml.dmlc.xgboost4j.scala.spark
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost} import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.ParamMap
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._ import org.apache.spark.sql._
import org.scalatest.FunSuite import org.scalatest.FunSuite
class XGBoostDFSuite extends FunSuite with PerTest { class XGBoostDFSuite extends FunSuite with PerTest {
private def buildDataFrame( private def buildDataFrame(
instances: Seq[MLLabeledPoint], labeledPoints: Seq[XGBLabeledPoint],
numPartitions: Int = numWorkers): DataFrame = { numPartitions: Int = numWorkers): DataFrame = {
val it = instances.iterator.zipWithIndex import DataUtils._
.map { case (instance: MLLabeledPoint, id: Int) => val it = labeledPoints.iterator.zipWithIndex
(id, instance.label, instance.features) .map { case (labeledPoint: XGBLabeledPoint, id: Int) =>
(id, labeledPoint.label, labeledPoint.features)
} }
ss.createDataFrame(sc.parallelize(it.toList, numPartitions)) ss.createDataFrame(sc.parallelize(it.toList, numPartitions))
@ -42,7 +44,6 @@ class XGBoostDFSuite extends FunSuite with PerTest {
"objective" -> "binary:logistic") "objective" -> "binary:logistic")
val trainingItr = Classification.train.iterator val trainingItr = Classification.train.iterator
val testItr = Classification.test.iterator val testItr = Classification.test.iterator
import DataUtils._
val round = 5 val round = 5
val trainDMatrix = new DMatrix(trainingItr) val trainDMatrix = new DMatrix(trainingItr)
val testDMatrix = new DMatrix(testItr) val testDMatrix = new DMatrix(testItr)
@ -157,7 +158,6 @@ class XGBoostDFSuite extends FunSuite with PerTest {
val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap, val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
round = 10, nWorkers = math.min(2, numWorkers)) round = 10, nWorkers = math.min(2, numWorkers))
val error = new EvalError val error = new EvalError
import DataUtils._
val testSetDMatrix = new DMatrix(testItr) val testSetDMatrix = new DMatrix(testItr)
assert(error.eval(xgBoostModelWithDF.booster.predict(testSetDMatrix, outPutMargin = true), assert(error.eval(xgBoostModelWithDF.booster.predict(testSetDMatrix, outPutMargin = true),
testSetDMatrix) < 0.1) testSetDMatrix) < 0.1)
@ -193,4 +193,24 @@ class XGBoostDFSuite extends FunSuite with PerTest {
assert(model.get[Double](model.eta).get == 0.1) assert(model.get[Double](model.eta).get == 0.1)
assert(model.get[Int](model.maxDepth).get == 6) assert(model.get[Int](model.maxDepth).get == 6)
} }
test("test use base margin") {
import DataUtils._
val trainingDf = buildDataFrame(Classification.train)
val trainingDfWithMargin = trainingDf.withColumn("margin", functions.rand())
val testRDD = sc.parallelize(Classification.test.map(_.features))
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic", "baseMarginCol" -> "margin")
def trainPredict(df: Dataset[_]): Array[Float] = {
XGBoost.trainWithDataFrame(df, paramMap, round = 1, numWorkers)
.predict(testRDD)
.map { case Array(p) => p }
.collect()
}
val pred = trainPredict(trainingDf)
val predWithMargin = trainPredict(trainingDfWithMargin)
assert((pred, predWithMargin).zipped.exists { case (p, pwm) => p !== pwm })
}
} }

View File

@ -19,7 +19,6 @@ package ml.dmlc.xgboost4j.scala.spark
import java.nio.file.Files import java.nio.file.Files
import java.util.concurrent.LinkedBlockingDeque import java.util.concurrent.LinkedBlockingDeque
import scala.collection.mutable.ListBuffer
import scala.util.Random import scala.util.Random
import ml.dmlc.xgboost4j.java.Rabit import ml.dmlc.xgboost4j.java.Rabit
@ -27,8 +26,8 @@ import ml.dmlc.xgboost4j.scala.DMatrix
import ml.dmlc.xgboost4j.scala.rabit.RabitTracker import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
import org.apache.spark.SparkContext import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
import org.apache.spark.ml.linalg.{Vectors, Vector => SparkVector} import org.apache.spark.ml.linalg.{DenseVector, Vectors, Vector => SparkVector}
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite import org.scalatest.FunSuite
@ -82,15 +81,15 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
"objective" -> "binary:logistic").toMap, "objective" -> "binary:logistic").toMap,
new java.util.HashMap[String, String](), new java.util.HashMap[String, String](),
numWorkers = 2, round = 5, eval = null, obj = null, useExternalMemory = true, numWorkers = 2, round = 5, eval = null, obj = null, useExternalMemory = true,
missing = Float.NaN, baseMargin = null) missing = Float.NaN)
val boosterCount = boosterRDD.count() val boosterCount = boosterRDD.count()
assert(boosterCount === 2) assert(boosterCount === 2)
} }
test("training with external memory cache") { test("training with external memory cache") {
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train)
import DataUtils._ import DataUtils._
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
val testSetDMatrix = new DMatrix(Classification.test.iterator) val testSetDMatrix = new DMatrix(Classification.test.iterator)
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1", val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic").toMap "objective" -> "binary:logistic").toMap
@ -101,9 +100,9 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
} }
test("training with Scala-implemented Rabit tracker") { test("training with Scala-implemented Rabit tracker") {
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train)
import DataUtils._ import DataUtils._
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
val testSetDMatrix = new DMatrix(Classification.test.iterator) val testSetDMatrix = new DMatrix(Classification.test.iterator)
val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1", val paramMap = List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic", "objective" -> "binary:logistic",
@ -115,9 +114,9 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
} }
ignore("test with fast histo depthwise") { ignore("test with fast histo depthwise") {
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train)
import DataUtils._ import DataUtils._
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
val testSetDMatrix = new DMatrix(Classification.test.iterator) val testSetDMatrix = new DMatrix(Classification.test.iterator)
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "6", "silent" -> "1", val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic", "tree_method" -> "hist", "objective" -> "binary:logistic", "tree_method" -> "hist",
@ -130,9 +129,9 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
} }
ignore("test with fast histo lossguide") { ignore("test with fast histo lossguide") {
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train)
import DataUtils._ import DataUtils._
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
val testSetDMatrix = new DMatrix(Classification.test.iterator) val testSetDMatrix = new DMatrix(Classification.test.iterator)
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "1", val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "1",
"objective" -> "binary:logistic", "tree_method" -> "hist", "objective" -> "binary:logistic", "tree_method" -> "hist",
@ -145,9 +144,9 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
} }
ignore("test with fast histo lossguide with max bin") { ignore("test with fast histo lossguide with max bin") {
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train)
import DataUtils._ import DataUtils._
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
val testSetDMatrix = new DMatrix(Classification.test.iterator) val testSetDMatrix = new DMatrix(Classification.test.iterator)
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0", val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0",
"objective" -> "binary:logistic", "tree_method" -> "hist", "objective" -> "binary:logistic", "tree_method" -> "hist",
@ -161,9 +160,9 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
} }
ignore("test with fast histo depthwidth with max depth") { ignore("test with fast histo depthwidth with max depth") {
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train)
import DataUtils._ import DataUtils._
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
val testSetDMatrix = new DMatrix(Classification.test.iterator) val testSetDMatrix = new DMatrix(Classification.test.iterator)
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0", val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0",
"objective" -> "binary:logistic", "tree_method" -> "hist", "objective" -> "binary:logistic", "tree_method" -> "hist",
@ -177,9 +176,9 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
} }
ignore("test with fast histo depthwidth with max depth and max bin") { ignore("test with fast histo depthwidth with max depth and max bin") {
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train)
import DataUtils._ import DataUtils._
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
val testSetDMatrix = new DMatrix(Classification.test.iterator) val testSetDMatrix = new DMatrix(Classification.test.iterator)
val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0", val paramMap = Map("eta" -> "1", "gamma" -> "0.5", "max_depth" -> "0", "silent" -> "0",
"objective" -> "binary:logistic", "tree_method" -> "hist", "objective" -> "binary:logistic", "tree_method" -> "hist",
@ -193,7 +192,7 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
} }
test("test with dense vectors containing missing value") { test("test with dense vectors containing missing value") {
def buildDenseRDD(): RDD[LabeledPoint] = { def buildDenseRDD(): RDD[MLLabeledPoint] = {
val numRows = 100 val numRows = 100
val numCols = 5 val numCols = 5
@ -203,23 +202,24 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
if (c == numCols - 1) -0.1 else Random.nextDouble() if (c == numCols - 1) -0.1 else Random.nextDouble()
} }
LabeledPoint(label, Vectors.dense(values)) MLLabeledPoint(label, Vectors.dense(values))
} }
sc.parallelize(labeledPoints) sc.parallelize(labeledPoints)
} }
val trainingRDD = buildDenseRDD().repartition(4) val trainingRDD = buildDenseRDD().repartition(4)
val testRDD = buildDenseRDD().repartition(4) val testRDD = buildDenseRDD().repartition(4).map(_.features.asInstanceOf[DenseVector])
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1", val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
"objective" -> "binary:logistic").toMap "objective" -> "binary:logistic").toMap
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers, val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, 5, numWorkers,
useExternalMemory = true) useExternalMemory = true)
xgBoostModel.predict(testRDD.map(_.features.toDense), missingValue = -0.1f).collect() xgBoostModel.predict(testRDD, missingValue = -0.1f).collect()
} }
test("test consistency of prediction functions with RDD") { test("test consistency of prediction functions with RDD") {
val trainingRDD = sc.parallelize(Classification.train) import DataUtils._
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
val testSet = Classification.test val testSet = Classification.test
val testRDD = sc.parallelize(testSet, numSlices = 1).map(_.features) val testRDD = sc.parallelize(testSet, numSlices = 1).map(_.features)
val testCollection = testRDD.collect() val testCollection = testRDD.collect()
@ -232,7 +232,6 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
val predRDD = xgBoostModel.predict(testRDD) val predRDD = xgBoostModel.predict(testRDD)
val predResult1 = predRDD.collect() val predResult1 = predRDD.collect()
assert(testRDD.count() === predResult1.length) assert(testRDD.count() === predResult1.length)
import DataUtils._
val predResult2 = xgBoostModel.booster.predict(new DMatrix(testSet.iterator)) val predResult2 = xgBoostModel.booster.predict(new DMatrix(testSet.iterator))
for (i <- predResult1.indices; j <- predResult1(i).indices) { for (i <- predResult1.indices; j <- predResult1(i).indices) {
assert(predResult1(i)(j) === predResult2(i)(j)) assert(predResult1(i)(j) === predResult2(i)(j))
@ -240,21 +239,22 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
} }
test("test eval functions with RDD") { test("test eval functions with RDD") {
val trainingRDD = sc.parallelize(Classification.train).cache() import DataUtils._
val trainingRDD = sc.parallelize(Classification.train).map(_.asML).cache()
val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1", val paramMap = Map("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
"objective" -> "binary:logistic") "objective" -> "binary:logistic")
val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5, nWorkers = numWorkers) val xgBoostModel = XGBoost.trainWithRDD(trainingRDD, paramMap, round = 5, numWorkers)
// Nan Zhu: deprecate it for now // Nan Zhu: deprecate it for now
// xgBoostModel.eval(trainingRDD, "eval1", iter = 5, useExternalCache = false) // xgBoostModel.eval(trainingRDD, "eval1", iter = 5, useExternalCache = false)
xgBoostModel.eval(trainingRDD, "eval2", evalFunc = new EvalError, useExternalCache = false) xgBoostModel.eval(trainingRDD, "eval2", evalFunc = new EvalError, useExternalCache = false)
} }
test("test prediction functionality with empty partition") { test("test prediction functionality with empty partition") {
import DataUtils._
def buildEmptyRDD(sparkContext: Option[SparkContext] = None): RDD[SparkVector] = { def buildEmptyRDD(sparkContext: Option[SparkContext] = None): RDD[SparkVector] = {
sparkContext.getOrElse(sc).parallelize(List[SparkVector](), numWorkers) sparkContext.getOrElse(sc).parallelize(List[SparkVector](), numWorkers)
} }
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
val trainingRDD = sc.parallelize(Classification.train)
val testRDD = buildEmptyRDD() val testRDD = buildEmptyRDD()
val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1", val paramMap = List("eta" -> "1", "max_depth" -> "2", "silent" -> "1",
"objective" -> "binary:logistic").toMap "objective" -> "binary:logistic").toMap
@ -263,9 +263,9 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
} }
test("test model consistency after save and load") { test("test model consistency after save and load") {
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train)
import DataUtils._ import DataUtils._
val eval = new EvalError()
val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
val testSetDMatrix = new DMatrix(Classification.test.iterator) val testSetDMatrix = new DMatrix(Classification.test.iterator)
val tempDir = Files.createTempDirectory("xgboosttest-") val tempDir = Files.createTempDirectory("xgboosttest-")
val tempFile = Files.createTempFile(tempDir, "", "") val tempFile = Files.createTempFile(tempDir, "", "")
@ -283,9 +283,10 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
} }
test("test save and load of different types of models") { test("test save and load of different types of models") {
import DataUtils._
val tempDir = Files.createTempDirectory("xgboosttest-") val tempDir = Files.createTempDirectory("xgboosttest-")
val tempFile = Files.createTempFile(tempDir, "", "") val tempFile = Files.createTempFile(tempDir, "", "")
val trainingRDD = sc.parallelize(Classification.train) val trainingRDD = sc.parallelize(Classification.train).map(_.asML)
var paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", var paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "reg:linear") "objective" -> "reg:linear")
// validate regression model // validate regression model
@ -320,7 +321,8 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
} }
test("test use groupData") { test("test use groupData") {
val trainingRDD = sc.parallelize(Ranking.train0, numSlices = 1) import DataUtils._
val trainingRDD = sc.parallelize(Ranking.train0, numSlices = 1).map(_.asML)
val trainGroupData: Seq[Seq[Int]] = Seq(Ranking.trainGroup0) val trainGroupData: Seq[Seq[Int]] = Seq(Ranking.trainGroup0)
val testRDD = sc.parallelize(Ranking.test, numSlices = 1).map(_.features) val testRDD = sc.parallelize(Ranking.test, numSlices = 1).map(_.features)
@ -337,9 +339,10 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
} }
test("test use nested groupData") { test("test use nested groupData") {
import DataUtils._
val trainingRDD0 = sc.parallelize(Ranking.train0, numSlices = 1) val trainingRDD0 = sc.parallelize(Ranking.train0, numSlices = 1)
val trainingRDD1 = sc.parallelize(Ranking.train1, numSlices = 1) val trainingRDD1 = sc.parallelize(Ranking.train1, numSlices = 1)
val trainingRDD = trainingRDD0.union(trainingRDD1) val trainingRDD = trainingRDD0.union(trainingRDD1).map(_.asML)
val trainGroupData: Seq[Seq[Int]] = Seq(Ranking.trainGroup0, Ranking.trainGroup1) val trainGroupData: Seq[Seq[Int]] = Seq(Ranking.trainGroup0, Ranking.trainGroup1)
@ -353,27 +356,4 @@ class XGBoostGeneralSuite extends FunSuite with PerTest {
val predResult1: Array[Array[Float]] = predRDD.collect() val predResult1: Array[Array[Float]] = predRDD.collect()
assert(testRDD.count() === predResult1.length) assert(testRDD.count() === predResult1.length)
} }
test("test use base margin") {
val trainRDD = sc.parallelize(Ranking.train0, numSlices = 1)
val testRDD = sc.parallelize(Ranking.test, numSlices = 1).map(_.features)
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "rank:pairwise")
val trainMargin = {
XGBoost.trainWithRDD(trainRDD, paramMap, round = 1, nWorkers = 2)
.predict(trainRDD.map(_.features), outputMargin = true)
.map { case Array(m) => m }
}
val xgBoostModel = XGBoost.trainWithRDD(
trainRDD,
paramMap,
round = 1,
nWorkers = 2,
baseMargin = trainMargin)
assert(testRDD.count() === xgBoostModel.predict(testRDD).count())
}
} }

View File

@ -1,48 +0,0 @@
package ml.dmlc.xgboost4j;
import java.io.Serializable;
/**
* Labeled data point for training examples.
* Represent a sparse training instance.
*/
public class LabeledPoint implements Serializable {
/** Label of the point */
public float label;
/** Weight of this data point */
public float weight = 1.0f;
/** Feature indices, used for sparse input */
public int[] indices = null;
/** Feature values */
public float[] values;
private LabeledPoint() {}
/**
* Create Labeled data point from sparse vector.
* @param label The label of the data point.
* @param indices The indices
* @param values The values.
*/
public static LabeledPoint fromSparseVector(float label, int[] indices, float[] values) {
LabeledPoint ret = new LabeledPoint();
ret.label = label;
ret.indices = indices;
ret.values = values;
assert indices.length == values.length;
return ret;
}
/**
* Create Labeled data point from dense vector.
* @param label The label of the data point.
* @param values The values.
*/
public static LabeledPoint fromDenseVector(float label, float[] values) {
LabeledPoint ret = new LabeledPoint();
ret.label = label;
ret.indices = null;
ret.values = values;
return ret;
}
}

View File

@ -55,7 +55,7 @@ class DataBatch {
while (base.hasNext() && batch.size() < batchSize) { while (base.hasNext() && batch.size() < batchSize) {
LabeledPoint labeledPoint = base.next(); LabeledPoint labeledPoint = base.next();
batch.add(labeledPoint); batch.add(labeledPoint);
numElem += labeledPoint.values.length; numElem += labeledPoint.values().length;
numRows++; numRows++;
} }
@ -68,18 +68,19 @@ class DataBatch {
for (int i = 0; i < batch.size(); i++) { for (int i = 0; i < batch.size(); i++) {
LabeledPoint labeledPoint = batch.get(i); LabeledPoint labeledPoint = batch.get(i);
rowOffset[i] = offset; rowOffset[i] = offset;
label[i] = labeledPoint.label; label[i] = labeledPoint.label();
if (labeledPoint.indices != null) { if (labeledPoint.indices() != null) {
System.arraycopy(labeledPoint.indices, 0, featureIndex, offset, System.arraycopy(labeledPoint.indices(), 0, featureIndex, offset,
labeledPoint.indices.length); labeledPoint.indices().length);
} else { } else {
for (int j = 0; j < labeledPoint.values.length; j++) { for (int j = 0; j < labeledPoint.values().length; j++) {
featureIndex[offset + j] = j; featureIndex[offset + j] = j;
} }
} }
System.arraycopy(labeledPoint.values, 0, featureValue, offset, labeledPoint.values.length); System.arraycopy(labeledPoint.values(), 0, featureValue, offset,
offset += labeledPoint.values.length; labeledPoint.values().length);
offset += labeledPoint.values().length;
} }
rowOffset[batch.size()] = offset; rowOffset[batch.size()] = offset;

View File

@ -0,0 +1,41 @@
/*
Copyright (c) 2014 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ml.dmlc.xgboost4j
/** Labeled training data point. */
private[xgboost4j] case class LabeledPoint(
/** Label of this point. */
label: Float,
/** Feature indices of this point or `null` if the data is dense. */
indices: Array[Int],
/** Feature values of this point. */
values: Array[Float],
/** Weight of this point. */
weight: Float = 1.0f,
/** Group of this point (used for ranking) or -1. */
group: Int = -1,
/** Initial prediction on this point or `Float.NaN`. */
baseMargin: Float = Float.NaN
) extends Serializable {
require(indices == null || indices.length == values.length,
"indices and values must have the same number of elements")
def this(label: Float, indices: Array[Int], values: Array[Float]) = {
// [[weight]] default duplicated to disambiguate the constructor call.
this(label, indices, values, 1.0f)
}
}

View File

@ -15,15 +15,11 @@
*/ */
package ml.dmlc.xgboost4j.java; package ml.dmlc.xgboost4j.java;
import java.awt.*;
import java.util.Arrays; import java.util.Arrays;
import java.util.Random; import java.util.Random;
import junit.framework.TestCase; import junit.framework.TestCase;
import ml.dmlc.xgboost4j.LabeledPoint; import ml.dmlc.xgboost4j.LabeledPoint;
import ml.dmlc.xgboost4j.java.DMatrix;
import ml.dmlc.xgboost4j.java.DataBatch;
import ml.dmlc.xgboost4j.java.XGBoostError;
import org.junit.Test; import org.junit.Test;
/** /**
@ -41,10 +37,10 @@ public class DMatrixTest {
int nrep = 3000; int nrep = 3000;
java.util.List<LabeledPoint> blist = new java.util.LinkedList<LabeledPoint>(); java.util.List<LabeledPoint> blist = new java.util.LinkedList<LabeledPoint>();
for (int i = 0; i < nrep; ++i) { for (int i = 0; i < nrep; ++i) {
LabeledPoint p = LabeledPoint.fromSparseVector( LabeledPoint p = new LabeledPoint(
0.1f + i, new int[]{0, 2, 3}, new float[]{3, 4, 5}); 0.1f + i, new int[]{0, 2, 3}, new float[]{3, 4, 5});
blist.add(p); blist.add(p);
labelall.add(p.label); labelall.add(p.label());
} }
DMatrix dmat = new DMatrix(blist.iterator(), null); DMatrix dmat = new DMatrix(blist.iterator(), null);
// get label // get label