[jvm-packages] Added baseMargin to ml.dmlc.xgboost4j.LabeledPoint (#2532)
* Converted ml.dmlc.xgboost4j.LabeledPoint to Scala
This makes it easy to integrate LabeledPoint with Spark DataFrame APIs,
which support encoding/decoding case classes out of the box. An alternative
solution would be to keep LabeledPoint in Java and make it a Bean by
generating boilerplate getters/setters. I have decided against that, even
though the conversion in this PR implies a public API change.
I also had to remove the factory methods fromSparseVector and
fromDenseVector because a) they would need to be duplicated to support
overloaded calls with extra data (e.g. weight); and b) Scala would expose
them via mangled $.MODULE$ which looks ugly in Java.
Additionally, this commit makes it possible to switch to LabeledPoint in
all public APIs and effectively to pass initial margin/group as part of
the point. This seems to be the only reliable way of implementing distributed
learning with these data. Note that the group size format used by single-node
XGBoost is not compatible with that scenario, since the partition split
could divide a group into two chunks.
* Switched to ml.dmlc.xgboost4j.LabeledPoint in RDD-based public APIs
Note that DataFrame-based and Flink APIs are not affected by this change.
* Removed baseMargin argument in favour of the LabeledPoint field
* Do a single pass over the partition in buildDistributedBoosters
Note that there is no formal guarantee that
val repartitioned = rdd.repartition(42)
repartitioned.zipPartitions(repartitioned.map(_ + 1)) { (it1, it2) => ... }
would do a single shuffle, but in practice it always seems to be the case.
* Exposed baseMargin in DataFrame-based API
* Addressed review comments
* Pass baseMargin to XGBoost.trainWithDataFrame via params
* Reverted MLLabeledPoint in Spark APIs
As discussed, baseMargin would only be supported for DataFrame-based APIs.
* Cleaned up baseMargin tests
- Removed RDD-based test, since the option is no longer exposed via
public APIs
- Changed DataFrame-based one to check that adding a margin actually
affects the prediction
* Pleased Scalastyle
* Addressed more review comments
* Pleased scalastyle again
* Fixed XGBoost.fromBaseMarginsToArray
which always returned an array of NaNs even if base margin was not
specified. Surprisingly this only failed a few tests.
This commit is contained in:
@@ -16,47 +16,55 @@
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
|
||||
import ml.dmlc.xgboost4j.LabeledPoint
|
||||
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
|
||||
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
|
||||
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
|
||||
|
||||
object DataUtils extends Serializable {
|
||||
private[spark] implicit class XGBLabeledPointFeatures(
|
||||
val labeledPoint: XGBLabeledPoint
|
||||
) extends AnyVal {
|
||||
/** Converts the point to [[MLLabeledPoint]]. */
|
||||
private[spark] def asML: MLLabeledPoint = {
|
||||
MLLabeledPoint(labeledPoint.label, labeledPoint.features)
|
||||
}
|
||||
|
||||
implicit def fromSparkPointsToXGBoostPointsJava(sps: Iterator[MLLabeledPoint])
|
||||
: java.util.Iterator[LabeledPoint] = {
|
||||
fromSparkPointsToXGBoostPoints(sps).asJava
|
||||
}
|
||||
|
||||
implicit def fromSparkPointsToXGBoostPoints(sps: Iterator[MLLabeledPoint]):
|
||||
Iterator[LabeledPoint] = {
|
||||
for (p <- sps) yield {
|
||||
p.features match {
|
||||
case denseFeature: DenseVector =>
|
||||
LabeledPoint.fromDenseVector(p.label.toFloat, denseFeature.values.map(_.toFloat))
|
||||
case sparseFeature: SparseVector =>
|
||||
LabeledPoint.fromSparseVector(p.label.toFloat, sparseFeature.indices,
|
||||
sparseFeature.values.map(_.toFloat))
|
||||
}
|
||||
/**
|
||||
* Returns feature of the point as [[org.apache.spark.ml.linalg.Vector]].
|
||||
*
|
||||
* If the point is sparse, the dimensionality of the resulting sparse
|
||||
* vector would be [[Int.MaxValue]]. This is the only safe value, since
|
||||
* XGBoost does not store the dimensionality explicitly.
|
||||
*/
|
||||
def features: Vector = if (labeledPoint.indices == null) {
|
||||
Vectors.dense(labeledPoint.values.map(_.toDouble))
|
||||
} else {
|
||||
Vectors.sparse(Int.MaxValue, labeledPoint.indices, labeledPoint.values.map(_.toDouble))
|
||||
}
|
||||
}
|
||||
|
||||
implicit def fromSparkVectorToXGBoostPointsJava(sps: Iterator[Vector])
|
||||
: java.util.Iterator[LabeledPoint] = {
|
||||
fromSparkVectorToXGBoostPoints(sps).asJava
|
||||
private[spark] implicit class MLLabeledPointToXGBLabeledPoint(
|
||||
val labeledPoint: MLLabeledPoint
|
||||
) extends AnyVal {
|
||||
/** Converts an [[MLLabeledPoint]] to an [[XGBLabeledPoint]]. */
|
||||
def asXGB: XGBLabeledPoint = {
|
||||
labeledPoint.features.asXGB.copy(label = labeledPoint.label.toFloat)
|
||||
}
|
||||
}
|
||||
|
||||
implicit def fromSparkVectorToXGBoostPoints(sps: Iterator[Vector])
|
||||
: Iterator[LabeledPoint] = {
|
||||
for (p <- sps) yield {
|
||||
p match {
|
||||
case denseFeature: DenseVector =>
|
||||
LabeledPoint.fromDenseVector(0.0f, denseFeature.values.map(_.toFloat))
|
||||
case sparseFeature: SparseVector =>
|
||||
LabeledPoint.fromSparseVector(0.0f, sparseFeature.indices,
|
||||
sparseFeature.values.map(_.toFloat))
|
||||
}
|
||||
private[spark] implicit class MLVectorToXGBLabeledPoint(val v: Vector) extends AnyVal {
|
||||
/**
|
||||
* Converts a [[Vector]] to a data point with a dummy label.
|
||||
*
|
||||
* This is needed for constructing a [[ml.dmlc.xgboost4j.scala.DMatrix]]
|
||||
* for prediction.
|
||||
*/
|
||||
def asXGB: XGBLabeledPoint = v match {
|
||||
case v: DenseVector =>
|
||||
XGBLabeledPoint(0.0f, null, v.values.map(_.toFloat))
|
||||
case v: SparseVector =>
|
||||
XGBLabeledPoint(0.0f, v.indices, v.values.map(_.toFloat))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,13 +21,13 @@ import scala.collection.mutable
|
||||
import ml.dmlc.xgboost4j.java.{IRabitTracker, Rabit, XGBoostError, RabitTracker => PyRabitTracker}
|
||||
import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
|
||||
import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
|
||||
import org.apache.commons.logging.LogFactory
|
||||
import org.apache.hadoop.fs.{FSDataInputStream, Path}
|
||||
|
||||
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
|
||||
import org.apache.spark.ml.linalg.SparseVector
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.Dataset
|
||||
import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
|
||||
import org.apache.spark.{SparkContext, TaskContext}
|
||||
|
||||
object TrackerConf {
|
||||
@@ -52,30 +52,49 @@ object XGBoost extends Serializable {
|
||||
private val logger = LogFactory.getLog("XGBoostSpark")
|
||||
|
||||
private def fromDenseToSparseLabeledPoints(
|
||||
denseLabeledPoints: Iterator[MLLabeledPoint],
|
||||
missing: Float): Iterator[MLLabeledPoint] = {
|
||||
denseLabeledPoints: Iterator[XGBLabeledPoint],
|
||||
missing: Float): Iterator[XGBLabeledPoint] = {
|
||||
if (!missing.isNaN) {
|
||||
denseLabeledPoints.map { case MLLabeledPoint(label, features) =>
|
||||
val dFeatures = features.toDense
|
||||
val indices = new mutable.ArrayBuilder.ofInt()
|
||||
val values = new mutable.ArrayBuilder.ofDouble()
|
||||
for (i <- dFeatures.values.indices) {
|
||||
if (dFeatures.values(i) != missing) {
|
||||
indices += i
|
||||
values += dFeatures.values(i)
|
||||
}
|
||||
denseLabeledPoints.map { labeledPoint =>
|
||||
val indicesBuilder = new mutable.ArrayBuilder.ofInt()
|
||||
val valuesBuilder = new mutable.ArrayBuilder.ofFloat()
|
||||
for ((value, i) <- labeledPoint.values.zipWithIndex if value != missing) {
|
||||
indicesBuilder += (if (labeledPoint.indices == null) i else labeledPoint.indices(i))
|
||||
valuesBuilder += value
|
||||
}
|
||||
val sFeatures = new SparseVector(dFeatures.values.length, indices.result(),
|
||||
values.result())
|
||||
MLLabeledPoint(label, sFeatures)
|
||||
labeledPoint.copy(indices = indicesBuilder.result(), values = valuesBuilder.result())
|
||||
}
|
||||
} else {
|
||||
denseLabeledPoints
|
||||
}
|
||||
}
|
||||
|
||||
private def fromBaseMarginsToArray(baseMargins: Iterator[Float]): Option[Array[Float]] = {
|
||||
val builder = new mutable.ArrayBuilder.ofFloat()
|
||||
var nTotal = 0
|
||||
var nUndefined = 0
|
||||
while (baseMargins.hasNext) {
|
||||
nTotal += 1
|
||||
val baseMargin = baseMargins.next()
|
||||
if (baseMargin.isNaN) {
|
||||
nUndefined += 1 // don't waste space for all-NaNs.
|
||||
} else {
|
||||
builder += baseMargin
|
||||
}
|
||||
}
|
||||
if (nUndefined == nTotal) {
|
||||
None
|
||||
} else if (nUndefined == 0) {
|
||||
Some(builder.result())
|
||||
} else {
|
||||
throw new IllegalArgumentException(
|
||||
s"Encountered a partition with $nUndefined NaN base margin values. " +
|
||||
"If you want to specify base margin, ensure all values are non-NaN.")
|
||||
}
|
||||
}
|
||||
|
||||
private[spark] def buildDistributedBoosters(
|
||||
trainingSet: RDD[MLLabeledPoint],
|
||||
trainingSet: RDD[XGBLabeledPoint],
|
||||
params: Map[String, Any],
|
||||
rabitEnv: java.util.Map[String, String],
|
||||
numWorkers: Int,
|
||||
@@ -83,25 +102,20 @@ object XGBoost extends Serializable {
|
||||
obj: ObjectiveTrait,
|
||||
eval: EvalTrait,
|
||||
useExternalMemory: Boolean,
|
||||
missing: Float,
|
||||
baseMargin: RDD[Float]): RDD[Booster] = {
|
||||
import DataUtils._
|
||||
|
||||
missing: Float): RDD[Booster] = {
|
||||
val partitionedTrainingSet = if (trainingSet.getNumPartitions != numWorkers) {
|
||||
logger.info(s"repartitioning training set to $numWorkers partitions")
|
||||
trainingSet.repartition(numWorkers)
|
||||
} else {
|
||||
trainingSet
|
||||
}
|
||||
val partitionedBaseMargin = Option(baseMargin)
|
||||
.getOrElse(trainingSet.sparkContext.emptyRDD)
|
||||
.repartition(partitionedTrainingSet.getNumPartitions)
|
||||
val partitionedBaseMargin = partitionedTrainingSet.map(_.baseMargin)
|
||||
val appName = partitionedTrainingSet.context.appName
|
||||
// to workaround the empty partitions in training dataset,
|
||||
// this might not be the best efficient implementation, see
|
||||
// (https://github.com/dmlc/xgboost/issues/1277)
|
||||
partitionedTrainingSet.zipPartitions(partitionedBaseMargin) { (trainingSamples, baseMargin) =>
|
||||
if (trainingSamples.isEmpty) {
|
||||
partitionedTrainingSet.zipPartitions(partitionedBaseMargin) { (trainingPoints, baseMargins) =>
|
||||
if (trainingPoints.isEmpty) {
|
||||
throw new XGBoostError(
|
||||
s"detected an empty partition in the training data, partition ID:" +
|
||||
s" ${TaskContext.getPartitionId()}")
|
||||
@@ -114,16 +128,15 @@ object XGBoost extends Serializable {
|
||||
}
|
||||
rabitEnv.put("DMLC_TASK_ID", TaskContext.getPartitionId().toString)
|
||||
Rabit.init(rabitEnv)
|
||||
val partitionItr = fromDenseToSparseLabeledPoints(trainingSamples, missing)
|
||||
val trainingMatrix = new DMatrix(partitionItr, cacheFileName)
|
||||
val trainingMatrix = new DMatrix(
|
||||
fromDenseToSparseLabeledPoints(trainingPoints, missing), cacheFileName)
|
||||
try {
|
||||
// TODO: use group attribute from the points.
|
||||
if (params.contains("groupData") && params("groupData") != null) {
|
||||
trainingMatrix.setGroup(params("groupData").asInstanceOf[Seq[Seq[Int]]](
|
||||
TaskContext.getPartitionId()).toArray)
|
||||
}
|
||||
if (baseMargin.nonEmpty) {
|
||||
trainingMatrix.setBaseMargin(baseMargin.toArray)
|
||||
}
|
||||
fromBaseMarginsToArray(baseMargins).foreach(trainingMatrix.setBaseMargin)
|
||||
val booster = SXGBoost.train(trainingMatrix, params, round,
|
||||
watches = Map("train" -> trainingMatrix), obj, eval)
|
||||
Iterator(booster)
|
||||
@@ -199,7 +212,6 @@ object XGBoost extends Serializable {
|
||||
* @param useExternalMemory indicate whether to use external memory cache, by setting this flag as
|
||||
* true, the user may save the RAM cost for running XGBoost within Spark
|
||||
* @param missing the value represented the missing value in the dataset
|
||||
* @param baseMargin initial prediction for boosting.
|
||||
* @throws ml.dmlc.xgboost4j.java.XGBoostError when the model training is failed
|
||||
* @return XGBoostModel when successful training
|
||||
*/
|
||||
@@ -212,10 +224,9 @@ object XGBoost extends Serializable {
|
||||
obj: ObjectiveTrait = null,
|
||||
eval: EvalTrait = null,
|
||||
useExternalMemory: Boolean = false,
|
||||
missing: Float = Float.NaN,
|
||||
baseMargin: RDD[Float] = null): XGBoostModel = {
|
||||
missing: Float = Float.NaN): XGBoostModel = {
|
||||
trainWithRDD(trainingData, params, round, nWorkers, obj, eval, useExternalMemory,
|
||||
missing, baseMargin)
|
||||
missing)
|
||||
}
|
||||
|
||||
private def overrideParamsAccordingToTaskCPUs(
|
||||
@@ -257,7 +268,6 @@ object XGBoost extends Serializable {
|
||||
* @param useExternalMemory indicate whether to use external memory cache, by setting this flag as
|
||||
* true, the user may save the RAM cost for running XGBoost within Spark
|
||||
* @param missing the value represented the missing value in the dataset
|
||||
* @param baseMargin initial prediction for boosting.
|
||||
* @throws ml.dmlc.xgboost4j.java.XGBoostError when the model training is failed
|
||||
* @return XGBoostModel when successful training
|
||||
*/
|
||||
@@ -270,30 +280,46 @@ object XGBoost extends Serializable {
|
||||
obj: ObjectiveTrait = null,
|
||||
eval: EvalTrait = null,
|
||||
useExternalMemory: Boolean = false,
|
||||
missing: Float = Float.NaN,
|
||||
baseMargin: RDD[Float] = null): XGBoostModel = {
|
||||
missing: Float = Float.NaN): XGBoostModel = {
|
||||
import DataUtils._
|
||||
val xgbTrainingData = trainingData.map { case MLLabeledPoint(label, features) =>
|
||||
features.asXGB.copy(label = label.toFloat)
|
||||
}
|
||||
trainDistributed(xgbTrainingData, params, round, nWorkers, obj, eval,
|
||||
useExternalMemory, missing)
|
||||
}
|
||||
|
||||
@throws(classOf[XGBoostError])
|
||||
private[spark] def trainDistributed(
|
||||
trainingData: RDD[XGBLabeledPoint],
|
||||
params: Map[String, Any],
|
||||
round: Int,
|
||||
nWorkers: Int,
|
||||
obj: ObjectiveTrait = null,
|
||||
eval: EvalTrait = null,
|
||||
useExternalMemory: Boolean = false,
|
||||
missing: Float = Float.NaN): XGBoostModel = {
|
||||
if (params.contains("tree_method")) {
|
||||
require(params("tree_method") != "hist", "xgboost4j-spark does not support fast histogram" +
|
||||
" for now")
|
||||
" for now")
|
||||
}
|
||||
require(nWorkers > 0, "you must specify more than 0 workers")
|
||||
if (obj != null) {
|
||||
require(params.get("obj_type").isDefined, "parameter \"obj_type\" is not defined," +
|
||||
" you have to specify the objective type as classification or regression with a" +
|
||||
" customized objective function")
|
||||
" you have to specify the objective type as classification or regression with a" +
|
||||
" customized objective function")
|
||||
}
|
||||
val trackerConf = params.get("tracker_conf") match {
|
||||
case None => TrackerConf()
|
||||
case Some(conf: TrackerConf) => conf
|
||||
case _ => throw new IllegalArgumentException("parameter \"tracker_conf\" must be an " +
|
||||
"instance of TrackerConf.")
|
||||
"instance of TrackerConf.")
|
||||
}
|
||||
val tracker = startTracker(nWorkers, trackerConf)
|
||||
try {
|
||||
val overriddenParams = overrideParamsAccordingToTaskCPUs(params, trainingData.sparkContext)
|
||||
val boosters = buildDistributedBoosters(trainingData, overriddenParams,
|
||||
tracker.getWorkerEnvs, nWorkers, round, obj, eval, useExternalMemory, missing,
|
||||
baseMargin)
|
||||
tracker.getWorkerEnvs, nWorkers, round, obj, eval, useExternalMemory, missing)
|
||||
val sparkJobThread = new Thread() {
|
||||
override def run() {
|
||||
// force the job
|
||||
|
||||
@@ -19,23 +19,23 @@ package ml.dmlc.xgboost4j.scala.spark
|
||||
import scala.collection.mutable
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.spark.params._
|
||||
import org.json4s.DefaultFormats
|
||||
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
|
||||
import org.apache.spark.ml.Predictor
|
||||
import org.apache.spark.ml.feature.LabeledPoint
|
||||
import org.apache.spark.ml.linalg.{Vector => MLVector}
|
||||
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
|
||||
import org.apache.spark.ml.param._
|
||||
import org.apache.spark.ml.util._
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.types.DoubleType
|
||||
import org.apache.spark.sql.types.FloatType
|
||||
import org.apache.spark.sql.{Dataset, Row}
|
||||
import org.json4s.DefaultFormats
|
||||
|
||||
/**
|
||||
* XGBoost Estimator to produce a XGBoost model
|
||||
*/
|
||||
class XGBoostEstimator private[spark](
|
||||
override val uid: String, xgboostParams: Map[String, Any])
|
||||
extends Predictor[MLVector, XGBoostEstimator, XGBoostModel]
|
||||
extends Predictor[Vector, XGBoostEstimator, XGBoostModel]
|
||||
with LearningTaskParams with GeneralParams with BoosterParams with MLWritable {
|
||||
|
||||
def this(xgboostParams: Map[String, Any]) =
|
||||
@@ -107,18 +107,32 @@ class XGBoostEstimator private[spark](
|
||||
}
|
||||
}
|
||||
|
||||
private def ensureColumns(trainingSet: Dataset[_]): Dataset[_] = {
|
||||
if (trainingSet.columns.contains($(baseMarginCol))) {
|
||||
trainingSet
|
||||
} else {
|
||||
trainingSet.withColumn($(baseMarginCol), lit(Float.NaN))
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* produce a XGBoostModel by fitting the given dataset
|
||||
*/
|
||||
override def train(trainingSet: Dataset[_]): XGBoostModel = {
|
||||
val instances = trainingSet.select(
|
||||
col($(featuresCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
|
||||
case Row(feature: MLVector, label: Double) =>
|
||||
LabeledPoint(label, feature)
|
||||
val instances = ensureColumns(trainingSet).select(
|
||||
col($(featuresCol)),
|
||||
col($(labelCol)).cast(FloatType),
|
||||
col($(baseMarginCol)).cast(FloatType)
|
||||
).rdd.map { case Row(features: Vector, label: Float, baseMargin: Float) =>
|
||||
val (indices, values) = features match {
|
||||
case v: SparseVector => (v.indices, v.values.map(_.toFloat))
|
||||
case v: DenseVector => (null, v.values.map(_.toFloat))
|
||||
}
|
||||
XGBLabeledPoint(label.toFloat, indices, values, baseMargin = baseMargin)
|
||||
}
|
||||
transformSchema(trainingSet.schema, logging = true)
|
||||
val derivedXGBoosterParamMap = fromParamsToXGBParamMap
|
||||
val trainedModel = XGBoost.trainWithRDD(instances, derivedXGBoosterParamMap,
|
||||
val trainedModel = XGBoost.trainDistributed(instances, derivedXGBoosterParamMap,
|
||||
$(round), $(nWorkers), $(customObj), $(customEval), $(useExternalMemory),
|
||||
$(missing)).setParent(this)
|
||||
val returnedModel = copyValues(trainedModel, extractParamMap())
|
||||
|
||||
@@ -21,6 +21,7 @@ import scala.collection.JavaConverters._
|
||||
import ml.dmlc.xgboost4j.java.Rabit
|
||||
import ml.dmlc.xgboost4j.scala.spark.params.{BoosterParams, DefaultXGBoostParamsWriter}
|
||||
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, EvalTrait}
|
||||
|
||||
import org.apache.hadoop.fs.{FSDataOutputStream, Path}
|
||||
|
||||
import org.apache.spark.ml.PredictionModel
|
||||
@@ -66,7 +67,7 @@ abstract class XGBoostModel(protected var _booster: Booster)
|
||||
val rabitEnv = Map("DMLC_TASK_ID" -> TaskContext.getPartitionId().toString)
|
||||
Rabit.init(rabitEnv.asJava)
|
||||
if (testSamples.nonEmpty) {
|
||||
val dMatrix = new DMatrix(testSamples)
|
||||
val dMatrix = new DMatrix(testSamples.map(_.asXGB))
|
||||
try {
|
||||
broadcastBooster.value.predictLeaf(dMatrix).iterator
|
||||
} finally {
|
||||
@@ -103,6 +104,7 @@ abstract class XGBoostModel(protected var _booster: Booster)
|
||||
val appName = evalDataset.context.appName
|
||||
val allEvalMetrics = evalDataset.mapPartitions {
|
||||
labeledPointsPartition =>
|
||||
import DataUtils._
|
||||
if (labeledPointsPartition.hasNext) {
|
||||
val rabitEnv = Map("DMLC_TASK_ID" -> TaskContext.getPartitionId().toString)
|
||||
Rabit.init(rabitEnv.asJava)
|
||||
@@ -114,8 +116,7 @@ abstract class XGBoostModel(protected var _booster: Booster)
|
||||
null
|
||||
}
|
||||
}
|
||||
import DataUtils._
|
||||
val dMatrix = new DMatrix(labeledPointsPartition, cacheFileName)
|
||||
val dMatrix = new DMatrix(labeledPointsPartition.map(_.features.asXGB), cacheFileName)
|
||||
try {
|
||||
if (groupData != null) {
|
||||
dMatrix.setGroup(groupData(TaskContext.getPartitionId()).toArray)
|
||||
@@ -202,7 +203,7 @@ abstract class XGBoostModel(protected var _booster: Booster)
|
||||
null
|
||||
}
|
||||
}
|
||||
val dMatrix = new DMatrix(testSamples, cacheFileName)
|
||||
val dMatrix = new DMatrix(testSamples.map(_.asXGB), cacheFileName)
|
||||
try {
|
||||
broadcastBooster.value.predict(dMatrix).iterator
|
||||
} finally {
|
||||
@@ -250,7 +251,7 @@ abstract class XGBoostModel(protected var _booster: Booster)
|
||||
null
|
||||
}
|
||||
}
|
||||
val testDataset = new DMatrix(vectorIterator, cachePrefix)
|
||||
val testDataset = new DMatrix(vectorIterator.map(_.asXGB), cachePrefix)
|
||||
try {
|
||||
val rawPredictResults = {
|
||||
if (!predLeaf) {
|
||||
|
||||
@@ -60,7 +60,13 @@ trait LearningTaskParams extends Params {
|
||||
val groupData = new GroupDataParam(this, "groupData", "group data specify each group size" +
|
||||
" for ranking task. To correspond to partition of training data, it is nested.")
|
||||
|
||||
setDefault(objective -> "reg:linear", baseScore -> 0.5, numClasses -> 2, groupData -> null)
|
||||
/**
|
||||
* Initial prediction (aka base margin) column name.
|
||||
*/
|
||||
val baseMarginCol = new Param[String](this, "baseMarginCol", "base margin column name")
|
||||
|
||||
setDefault(objective -> "reg:linear", baseScore -> 0.5, numClasses -> 2, groupData -> null,
|
||||
baseMarginCol -> "baseMargin")
|
||||
}
|
||||
|
||||
private[spark] object LearningTaskParams {
|
||||
|
||||
Reference in New Issue
Block a user