[jvm-packages] Exposed baseMargin (#2450)

* Disabled excessive Spark logging in tests

* Fixed the signature of XGBoostModel.predict

Prior to this commit XGBoostModel.predict produced an RDD with
an array of predictions for each partition, effectively changing
the shape wrt the input RDD. A more natural contract for prediction
API is that given an RDD it returns a new RDD with the same number
of elements. This allows the users to easily match inputs with
predictions.

This commit removes one layer of nesting in XGBoostModel.predict output.
Even though the change is clearly non-backward compatible, I still
think it is well justified.

* Removed boxing in XGBoost.fromDenseToSparseLabeledPoints

* Inlined XGBoost.repartitionData

An if is more explicit than an opaque method name.

* Moved XGBoost.convertBoosterToXGBoostModel to XGBoostModel

* Check the input dimension in DMatrix.setBaseMargin

Prior to this commit providing an array of incorrect dimensions would
have resulted in memory corruption. Maybe backport this to C++?

* Reduced nesting in XGBoost.buildDistributedBoosters

* Ensured consistent naming of the params map

* Cleaned up DataBatch to make it easier to comprehend

* Made scalastyle happy

* Added baseMargin to XGBoost.train and trainWithRDD

* Deprecated XGBoost.train

It is ambiguous and works only for RDDs.

* Addressed review comments

* Revert "Fixed a singature of XGBoostModel.predict"

This reverts commit 06bd5dcae7780265dd57e93ed7d4135f4e78f9b4.

* Addressed more review comments

* Fixed NullPointerException in buildDistributedBoosters
This commit is contained in:
Sergei Lebedev
2017-06-30 17:27:24 +02:00
committed by Nan Zhu
parent 6b287177c8
commit d535340459
8 changed files with 206 additions and 190 deletions

View File

@@ -22,19 +22,22 @@ import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite}
trait SharedSparkContext extends FunSuite with BeforeAndAfter with BeforeAndAfterAll
with Serializable {
@transient protected implicit var sc: SparkContext = null
@transient protected implicit var sc: SparkContext = _
override def beforeAll() {
// build SparkContext
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("XGBoostSuite").
set("spark.driver.memory", "512m")
val sparkConf = new SparkConf()
.setMaster("local[*]")
.setAppName("XGBoostSuite")
.set("spark.driver.memory", "512m")
.set("spark.ui.enabled", "false")
sc = new SparkContext(sparkConf)
sc.setLogLevel("ERROR")
}
override def afterAll() {
if (sc != null) {
sc.stop()
sc = null
}
}
}

View File

@@ -17,17 +17,15 @@
package ml.dmlc.xgboost4j.scala.spark
import java.nio.file.Files
import java.util.concurrent.{BlockingQueue, LinkedBlockingDeque}
import java.util.concurrent.LinkedBlockingDeque
import scala.collection.mutable.ListBuffer
import scala.io.Source
import scala.util.Random
import scala.concurrent.duration._
import ml.dmlc.xgboost4j.java.{Rabit, DMatrix => JDMatrix, RabitTracker => PyRabitTracker}
import ml.dmlc.xgboost4j.java.{Rabit, DMatrix => JDMatrix}
import ml.dmlc.xgboost4j.scala.DMatrix
import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
import org.scalatest.Ignore
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.LabeledPoint
@@ -83,7 +81,8 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
List("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic").toMap,
new java.util.HashMap[String, String](),
numWorkers = 2, round = 5, eval = null, obj = null, useExternalMemory = true)
numWorkers = 2, round = 5, eval = null, obj = null, useExternalMemory = true,
missing = Float.NaN, baseMargin = null)
val boosterCount = boosterRDD.count()
assert(boosterCount === 2)
cleanExternalCache("XGBoostSuite")
@@ -390,4 +389,30 @@ class XGBoostGeneralSuite extends SharedSparkContext with Utils {
val predResult1: Array[Array[Float]] = predRDD.collect()(0)
assert(testRDD.count() === predResult1.length)
}
test("test use base margin") {
val trainSet = loadLabelPoints(getClass.getResource("/rank-demo-0.txt.train").getFile)
val trainRDD = sc.parallelize(trainSet, numSlices = 1)
val testSet = loadLabelPoints(getClass.getResource("/rank-demo.txt.test").getFile)
val testRDD = sc.parallelize(testSet, numSlices = 1).map(_.features)
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "rank:pairwise")
val trainMargin = {
XGBoost.trainWithRDD(trainRDD, paramMap, round = 1, nWorkers = 2)
.predict(trainRDD.map(_.features), outputMargin = true)
.flatMap { _.flatten.iterator }
}
val xgBoostModel = XGBoost.trainWithRDD(
trainRDD,
paramMap,
round = 1,
nWorkers = 2,
baseMargin = trainMargin)
assert(testRDD.count() === xgBoostModel.predict(testRDD).first().length)
}
}