[BLOCKING] [jvm-packages] add gpu_hist and enable gpu scheduling (#5171)
* [jvm-packages] add gpu_hist tree method * change updater hist to grow_quantile_histmaker * add gpu scheduling * pass correct parameters to xgboost library * remove debug info * add use.cuda for pom * add CI for gpu_hist for jvm * add gpu unit tests * use gpu node to build jvm * use nvidia-docker * Add CLI interface to create_jni.py using argparse Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
@@ -22,7 +22,6 @@ import java.nio.file.Files
|
||||
import scala.collection.{AbstractIterator, mutable}
|
||||
import scala.util.Random
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
import ml.dmlc.xgboost4j.java.{IRabitTracker, Rabit, XGBoostError, RabitTracker => PyRabitTracker}
|
||||
import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
|
||||
import ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams
|
||||
@@ -32,7 +31,6 @@ import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
|
||||
import org.apache.commons.io.FileUtils
|
||||
import org.apache.commons.logging.LogFactory
|
||||
import org.apache.hadoop.fs.FileSystem
|
||||
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.{SparkContext, SparkParallelismTracker, TaskContext, TaskFailedListener}
|
||||
import org.apache.spark.sql.SparkSession
|
||||
@@ -76,7 +74,9 @@ private[this] case class XGBoostExecutionParams(
|
||||
checkpointParam: Option[ExternalCheckpointParams],
|
||||
xgbInputParams: XGBoostExecutionInputParams,
|
||||
earlyStoppingParams: XGBoostExecutionEarlyStoppingParams,
|
||||
cacheTrainingSet: Boolean) {
|
||||
cacheTrainingSet: Boolean,
|
||||
treeMethod: Option[String],
|
||||
isLocal: Boolean) {
|
||||
|
||||
private var rawParamMap: Map[String, Any] = _
|
||||
|
||||
@@ -93,6 +93,8 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s
|
||||
|
||||
private val logger = LogFactory.getLog("XGBoostSpark")
|
||||
|
||||
private val isLocal = sc.isLocal
|
||||
|
||||
private val overridedParams = overrideParams(rawParams, sc)
|
||||
|
||||
/**
|
||||
@@ -168,11 +170,14 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s
|
||||
.getOrElse("allow_non_zero_for_missing", false)
|
||||
.asInstanceOf[Boolean]
|
||||
validateSparkSslConf
|
||||
var treeMethod: Option[String] = None
|
||||
if (overridedParams.contains("tree_method")) {
|
||||
require(overridedParams("tree_method") == "hist" ||
|
||||
overridedParams("tree_method") == "approx" ||
|
||||
overridedParams("tree_method") == "auto", "xgboost4j-spark only supports tree_method as" +
|
||||
" 'hist', 'approx' and 'auto'")
|
||||
overridedParams("tree_method") == "auto" ||
|
||||
overridedParams("tree_method") == "gpu_hist", "xgboost4j-spark only supports tree_method" +
|
||||
" as 'hist', 'approx', 'gpu_hist', and 'auto'")
|
||||
treeMethod = Some(overridedParams("tree_method").asInstanceOf[String])
|
||||
}
|
||||
if (overridedParams.contains("train_test_ratio")) {
|
||||
logger.warn("train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly" +
|
||||
@@ -221,7 +226,9 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s
|
||||
checkpointParam,
|
||||
inputParams,
|
||||
xgbExecEarlyStoppingParams,
|
||||
cacheTrainingSet)
|
||||
cacheTrainingSet,
|
||||
treeMethod,
|
||||
isLocal)
|
||||
xgbExecParam.setRawParamMap(overridedParams)
|
||||
xgbExecParam
|
||||
}
|
||||
@@ -335,6 +342,26 @@ object XGBoost extends Serializable {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Resolves the GPU device id that Spark assigned to the currently running task.
 *
 * Looks up the "gpu" entry in the task's resources and returns its first address parsed as
 * an integer. When more than one address is allocated a warning is logged and only the first
 * one is used.
 *
 * @return the first GPU address of the current task, as an integer
 * @throws RuntimeException if there is no task context, if no "gpu" resource is allocated,
 *                          if the "gpu" resource has no addresses, or if the first address
 *                          is not an integer
 */
private def getGPUAddrFromResources: Int = {
  val tc = TaskContext.get()
  if (tc == null) {
    throw new RuntimeException("Something wrong for task context")
  }
  val resources = tc.resources()
  if (resources.contains("gpu")) {
    val addrs = resources("gpu").addresses
    if (addrs.size > 1) {
      // TODO should we throw exception ?
      logger.warn("XGBoost only supports 1 gpu per worker")
    }
    // Take the first address, but fail with an explicit message instead of a bare
    // NoSuchElementException / NumberFormatException when it is missing or malformed.
    addrs.headOption match {
      case Some(addr) =>
        try {
          addr.toInt
        } catch {
          case e: NumberFormatException =>
            throw new RuntimeException(
              s"gpu address '$addr' allocated by spark is not an integer device id", e)
        }
      case None =>
        throw new RuntimeException("gpu resource is allocated by spark but has no address, " +
          "please check the gpu discovery script")
    }
  } else {
    throw new RuntimeException("gpu is not allocated by spark, " +
      "please check if gpu scheduling is enabled")
  }
}
|
||||
|
||||
private def buildDistributedBooster(
|
||||
watches: Watches,
|
||||
xgbExecutionParam: XGBoostExecutionParams,
|
||||
@@ -362,13 +389,25 @@ object XGBoost extends Serializable {
|
||||
val numEarlyStoppingRounds = xgbExecutionParam.earlyStoppingParams.numEarlyStoppingRounds
|
||||
val metrics = Array.tabulate(watches.size)(_ => Array.ofDim[Float](numRounds))
|
||||
val externalCheckpointParams = xgbExecutionParam.checkpointParam
|
||||
|
||||
var params = xgbExecutionParam.toMap
|
||||
if (xgbExecutionParam.treeMethod.exists(m => m == "gpu_hist")) {
|
||||
val gpuId = if (xgbExecutionParam.isLocal) {
|
||||
// For local mode, force gpu id to primary device
|
||||
0
|
||||
} else {
|
||||
getGPUAddrFromResources
|
||||
}
|
||||
logger.info("Leveraging gpu device " + gpuId + " to train")
|
||||
params = params + ("gpu_id" -> gpuId)
|
||||
}
|
||||
val booster = if (makeCheckpoint) {
|
||||
SXGBoost.trainAndSaveCheckpoint(
|
||||
watches.toMap("train"), xgbExecutionParam.toMap, numRounds,
|
||||
watches.toMap("train"), params, numRounds,
|
||||
watches.toMap, metrics, obj, eval,
|
||||
earlyStoppingRound = numEarlyStoppingRounds, prevBooster, externalCheckpointParams)
|
||||
} else {
|
||||
SXGBoost.train(watches.toMap("train"), xgbExecutionParam.toMap, numRounds,
|
||||
SXGBoost.train(watches.toMap("train"), params, numRounds,
|
||||
watches.toMap, metrics, obj, eval,
|
||||
earlyStoppingRound = numEarlyStoppingRounds, prevBooster)
|
||||
}
|
||||
|
||||
@@ -145,11 +145,12 @@ private[spark] trait BoosterParams extends Params {
|
||||
final def getAlpha: Double = $(alpha)
|
||||
|
||||
/**
|
||||
* The tree construction algorithm used in XGBoost. options: {'auto', 'exact', 'approx'}
|
||||
* [default='auto']
|
||||
* The tree construction algorithm used in XGBoost. options:
|
||||
* {'auto', 'exact', 'approx', 'hist', 'gpu_hist'} [default='auto']
|
||||
*/
|
||||
final val treeMethod = new Param[String](this, "treeMethod",
|
||||
"The tree construction algorithm used in XGBoost, options: {'auto', 'exact', 'approx', 'hist'}",
|
||||
"The tree construction algorithm used in XGBoost, options: " +
|
||||
"{'auto', 'exact', 'approx', 'hist', 'gpu_hist'}",
|
||||
(value: String) => BoosterParams.supportedTreeMethods.contains(value))
|
||||
|
||||
final def getTreeMethod: String = $(treeMethod)
|
||||
@@ -292,7 +293,7 @@ private[spark] object BoosterParams {
|
||||
|
||||
val supportedBoosters = HashSet("gbtree", "gblinear", "dart")
|
||||
|
||||
val supportedTreeMethods = HashSet("auto", "exact", "approx", "hist")
|
||||
val supportedTreeMethods = HashSet("auto", "exact", "approx", "hist", "gpu_hist")
|
||||
|
||||
val supportedGrowthPolicies = HashSet("depthwise", "lossguide")
|
||||
|
||||
|
||||
@@ -261,10 +261,10 @@ private[spark] trait ParamMapFuncs extends Params {
|
||||
for ((paramName, paramValue) <- xgboostParams) {
|
||||
if ((paramName == "booster" && paramValue != "gbtree") ||
|
||||
(paramName == "updater" && paramValue != "grow_histmaker,prune" &&
|
||||
paramValue != "hist")) {
|
||||
paramValue != "grow_quantile_histmaker" && paramValue != "grow_gpu_hist")) {
|
||||
throw new IllegalArgumentException(s"you specified $paramName as $paramValue," +
|
||||
s" XGBoost-Spark only supports gbtree as booster type" +
|
||||
" and grow_histmaker,prune or hist as the updater type")
|
||||
s" XGBoost-Spark only supports gbtree as booster type and grow_histmaker,prune or" +
|
||||
s" grow_quantile_histmaker or grow_gpu_hist as the updater type")
|
||||
}
|
||||
val name = CaseFormat.LOWER_UNDERSCORE.to(CaseFormat.LOWER_CAMEL, paramName)
|
||||
params.find(_.name == name).foreach {
|
||||
|
||||
@@ -16,82 +16,16 @@
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.java.GpuTestSuite
|
||||
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
|
||||
import org.apache.spark.ml.linalg._
|
||||
import org.apache.spark.sql._
|
||||
import org.scalatest.FunSuite
|
||||
import org.apache.spark.Partitioner
|
||||
|
||||
class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
abstract class XGBoostClassifierSuiteBase extends FunSuite with PerTest {
|
||||
|
||||
test("XGBoost-Spark XGBoostClassifier output should match XGBoost4j") {
|
||||
val trainingDM = new DMatrix(Classification.train.iterator)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val trainingDF = buildDataFrame(Classification.train)
|
||||
val testDF = buildDataFrame(Classification.test)
|
||||
checkResultsWithXGBoost4j(trainingDM, testDM, trainingDF, testDF)
|
||||
}
|
||||
|
||||
test("XGBoostClassifier should make correct predictions after upstream random sort") {
|
||||
val trainingDM = new DMatrix(Classification.train.iterator)
|
||||
val testDM = new DMatrix(Classification.test.iterator)
|
||||
val trainingDF = buildDataFrameWithRandSort(Classification.train)
|
||||
val testDF = buildDataFrameWithRandSort(Classification.test)
|
||||
checkResultsWithXGBoost4j(trainingDM, testDM, trainingDF, testDF)
|
||||
}
|
||||
|
||||
private def checkResultsWithXGBoost4j(
|
||||
trainingDM: DMatrix,
|
||||
testDM: DMatrix,
|
||||
trainingDF: DataFrame,
|
||||
testDF: DataFrame,
|
||||
round: Int = 5): Unit = {
|
||||
val paramMap = Map(
|
||||
"eta" -> "1",
|
||||
"max_depth" -> "6",
|
||||
"silent" -> "1",
|
||||
"objective" -> "binary:logistic")
|
||||
|
||||
val model1 = ScalaXGBoost.train(trainingDM, paramMap, round)
|
||||
val prediction1 = model1.predict(testDM)
|
||||
|
||||
val model2 = new XGBoostClassifier(paramMap ++ Array("num_round" -> round,
|
||||
"num_workers" -> numWorkers)).fit(trainingDF)
|
||||
|
||||
val prediction2 = model2.transform(testDF).
|
||||
collect().map(row => (row.getAs[Int]("id"), row.getAs[DenseVector]("probability"))).toMap
|
||||
|
||||
assert(testDF.count() === prediction2.size)
|
||||
// the vector length in probability column is 2 since we have to fit to the evaluator in Spark
|
||||
for (i <- prediction1.indices) {
|
||||
assert(prediction1(i).length === prediction2(i).values.length - 1)
|
||||
for (j <- prediction1(i).indices) {
|
||||
assert(prediction1(i)(j) === prediction2(i)(j + 1))
|
||||
}
|
||||
}
|
||||
|
||||
val prediction3 = model1.predict(testDM, outPutMargin = true)
|
||||
val prediction4 = model2.transform(testDF).
|
||||
collect().map(row => (row.getAs[Int]("id"), row.getAs[DenseVector]("rawPrediction"))).toMap
|
||||
|
||||
assert(testDF.count() === prediction4.size)
|
||||
// the vector length in rawPrediction column is 2 since we have to fit to the evaluator in Spark
|
||||
for (i <- prediction3.indices) {
|
||||
assert(prediction3(i).length === prediction4(i).values.length - 1)
|
||||
for (j <- prediction3(i).indices) {
|
||||
assert(prediction3(i)(j) === prediction4(i)(j + 1))
|
||||
}
|
||||
}
|
||||
|
||||
// check the equality of single instance prediction
|
||||
val firstOfDM = testDM.slice(Array(0))
|
||||
val firstOfDF = testDF.filter(_.getAs[Int]("id") == 0)
|
||||
.head()
|
||||
.getAs[Vector]("features")
|
||||
val prediction5 = math.round(model1.predict(firstOfDM)(0)(0))
|
||||
val prediction6 = model2.predict(firstOfDF)
|
||||
assert(prediction5 === prediction6)
|
||||
}
|
||||
protected val treeMethod: String = "auto"
|
||||
|
||||
test("Set params in XGBoost and MLlib way should produce same model") {
|
||||
val trainingDF = buildDataFrame(Classification.train)
|
||||
@@ -104,6 +38,7 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
"silent" -> "1",
|
||||
"objective" -> "binary:logistic",
|
||||
"num_round" -> round,
|
||||
"tree_method" -> treeMethod,
|
||||
"num_workers" -> numWorkers)
|
||||
|
||||
// Set params in XGBoost way
|
||||
@@ -128,7 +63,8 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
|
||||
test("test schema of XGBoostClassificationModel") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
"objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers,
|
||||
"tree_method" -> treeMethod)
|
||||
val trainingDF = buildDataFrame(Classification.train)
|
||||
val testDF = buildDataFrame(Classification.test)
|
||||
|
||||
@@ -160,7 +96,7 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
test("multi class classification") {
|
||||
val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "multi:softmax", "num_class" -> "6", "num_round" -> 5,
|
||||
"num_workers" -> numWorkers)
|
||||
"num_workers" -> numWorkers, "tree_method" -> treeMethod)
|
||||
val trainingDF = buildDataFrame(MultiClassification.train)
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
val model = xgb.fit(trainingDF)
|
||||
@@ -175,7 +111,7 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
val test = buildDataFrame(Classification.test)
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "train_test_ratio" -> "1.0",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
"num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod)
|
||||
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
val model1 = xgb.fit(training1)
|
||||
@@ -194,7 +130,7 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
test("test predictionLeaf") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
"num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val test = buildDataFrame(Classification.test)
|
||||
val groundTruth = test.count()
|
||||
@@ -209,7 +145,7 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
test("test predictionLeaf with empty column name") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
"num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val test = buildDataFrame(Classification.test)
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
@@ -222,7 +158,7 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
test("test predictionContrib") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
"num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val test = buildDataFrame(Classification.test)
|
||||
val groundTruth = test.count()
|
||||
@@ -237,7 +173,7 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
test("test predictionContrib with empty column name") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
"num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val test = buildDataFrame(Classification.test)
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
@@ -250,7 +186,7 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
test("test predictionLeaf and predictionContrib") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic", "train_test_ratio" -> "0.5",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers)
|
||||
"num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod)
|
||||
val training = buildDataFrame(Classification.train)
|
||||
val test = buildDataFrame(Classification.test)
|
||||
val groundTruth = test.count()
|
||||
@@ -264,6 +200,80 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
assert(resultDF.columns.contains("predictContrib"))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class XGBoostCpuClassifierSuite extends XGBoostClassifierSuiteBase {
|
||||
test("XGBoost-Spark XGBoostClassifier output should match XGBoost4j") {
  // Feed the same classification data to XGBoost4j (as DMatrix) and to
  // XGBoost-Spark (as DataFrame), then compare the two sets of predictions.
  checkResultsWithXGBoost4j(
    trainingDM = new DMatrix(Classification.train.iterator),
    testDM = new DMatrix(Classification.test.iterator),
    trainingDF = buildDataFrame(Classification.train),
    testDF = buildDataFrame(Classification.test))
}
|
||||
|
||||
test("XGBoostClassifier should make correct predictions after upstream random sort") {
  // Same comparison as the plain case, except that the DataFrames are built with
  // buildDataFrameWithRandSort while the DMatrix inputs are unchanged.
  checkResultsWithXGBoost4j(
    trainingDM = new DMatrix(Classification.train.iterator),
    testDM = new DMatrix(Classification.test.iterator),
    trainingDF = buildDataFrameWithRandSort(Classification.train),
    testDF = buildDataFrameWithRandSort(Classification.test))
}
|
||||
|
||||
/**
 * Trains one model with XGBoost4j and one with XGBoostClassifier on the same data and
 * asserts that their outputs agree.
 *
 * Three things are compared: the "probability" column against the XGBoost4j predictions,
 * the "rawPrediction" column against the XGBoost4j margin predictions, and the
 * single-instance prediction for the row whose id is 0.
 *
 * @param trainingDM training data for XGBoost4j
 * @param testDM     test data for XGBoost4j
 * @param trainingDF training data for XGBoostClassifier
 * @param testDF     test data for XGBoostClassifier; read through its "id" and "features"
 *                   columns
 * @param round      number of boosting rounds used for both models
 */
private def checkResultsWithXGBoost4j(
    trainingDM: DMatrix,
    testDM: DMatrix,
    trainingDF: DataFrame,
    testDF: DataFrame,
    round: Int = 5): Unit = {
  val paramMap = Map(
    "eta" -> "1",
    "max_depth" -> "6",
    "silent" -> "1",
    "objective" -> "binary:logistic",
    "tree_method" -> treeMethod,
    "max_bin" -> 16)

  val nativeModel = ScalaXGBoost.train(trainingDM, paramMap, round)
  val nativeProbabilities = nativeModel.predict(testDM)

  val sparkParams = paramMap ++ Array("num_round" -> round, "num_workers" -> numWorkers)
  val sparkModel = new XGBoostClassifier(sparkParams).fit(trainingDF)

  // Runs the Spark model on testDF and indexes the requested vector column by row id.
  def sparkOutputById(column: String): Map[Int, DenseVector] = {
    val rows = sparkModel.transform(testDF).collect()
    rows.map(row => (row.getAs[Int]("id"), row.getAs[DenseVector](column))).toMap
  }

  // Each Spark vector holds one more entry than the matching XGBoost4j row (it has to fit
  // the evaluator in Spark), so XGBoost4j entry j is checked against Spark entry j + 1.
  def assertSameAsNative(native: Array[Array[Float]], spark: Map[Int, DenseVector]): Unit = {
    assert(testDF.count() === spark.size)
    native.indices.foreach { i =>
      assert(native(i).length === spark(i).values.length - 1)
      native(i).indices.foreach { j =>
        assert(native(i)(j) === spark(i)(j + 1))
      }
    }
  }

  assertSameAsNative(nativeProbabilities, sparkOutputById("probability"))
  assertSameAsNative(
    nativeModel.predict(testDM, outPutMargin = true),
    sparkOutputById("rawPrediction"))

  // check the equality of single instance prediction
  val singleRowDM = testDM.slice(Array(0))
  val singleRowFeatures = testDF
    .filter(_.getAs[Int]("id") == 0)
    .head()
    .getAs[Vector]("features")
  val nativeLabel = math.round(nativeModel.predict(singleRowDM)(0)(0))
  val sparkLabel = sparkModel.predict(singleRowFeatures)
  assert(nativeLabel === sparkLabel)
}
|
||||
|
||||
test("infrequent features") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "binary:logistic",
|
||||
@@ -305,5 +315,10 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
val xgb = new XGBoostClassifier(paramMap)
|
||||
xgb.fit(repartitioned)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
 * Runs the tests inherited from XGBoostClassifierSuiteBase with `tree_method` set to
 * "gpu_hist" on a single worker.
 *
 * NOTE(review): @GpuTestSuite presumably tags the suite so it only runs on GPU builds —
 * confirm against the build configuration.
 */
@GpuTestSuite
class XGBoostGpuClassifierSuite extends XGBoostClassifierSuiteBase {
  // One worker only for the GPU run.
  override protected val numWorkers: Int = 1
  // Switch the inherited tests from the default "auto" to the GPU tree method.
  override protected val treeMethod: String = "gpu_hist"
}
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.java.GpuTestSuite
|
||||
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
|
||||
import org.apache.spark.ml.linalg.Vector
|
||||
import org.apache.spark.sql.functions._
|
||||
@@ -23,7 +24,8 @@ import org.apache.spark.sql.{DataFrame, Row}
|
||||
import org.apache.spark.sql.types._
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
abstract class XGBoostRegressorSuiteBase extends FunSuite with PerTest {
|
||||
protected val treeMethod: String = "auto"
|
||||
|
||||
test("XGBoost-Spark XGBoostRegressor output should match XGBoost4j") {
|
||||
val trainingDM = new DMatrix(Regression.train.iterator)
|
||||
@@ -51,7 +53,9 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
"eta" -> "1",
|
||||
"max_depth" -> "6",
|
||||
"silent" -> "1",
|
||||
"objective" -> "reg:squarederror")
|
||||
"objective" -> "reg:squarederror",
|
||||
"max_bin" -> 16,
|
||||
"tree_method" -> treeMethod)
|
||||
|
||||
val model1 = ScalaXGBoost.train(trainingDM, paramMap, round)
|
||||
val prediction1 = model1.predict(testDM)
|
||||
@@ -88,6 +92,7 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
"silent" -> "1",
|
||||
"objective" -> "reg:squarederror",
|
||||
"num_round" -> round,
|
||||
"tree_method" -> treeMethod,
|
||||
"num_workers" -> numWorkers)
|
||||
|
||||
// Set params in XGBoost way
|
||||
@@ -99,6 +104,7 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
.setSilent(1)
|
||||
.setObjective("reg:squarederror")
|
||||
.setNumRound(round)
|
||||
.setTreeMethod(treeMethod)
|
||||
.setNumWorkers(numWorkers)
|
||||
.fit(trainingDF)
|
||||
|
||||
@@ -113,7 +119,7 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
test("ranking: use group data") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "rank:pairwise", "num_workers" -> numWorkers, "num_round" -> 5,
|
||||
"group_col" -> "group")
|
||||
"group_col" -> "group", "tree_method" -> treeMethod)
|
||||
|
||||
val trainingDF = buildDataFrameWithGroup(Ranking.train)
|
||||
val testDF = buildDataFrame(Ranking.test)
|
||||
@@ -125,7 +131,8 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
|
||||
test("use weight") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
"objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers,
|
||||
"tree_method" -> treeMethod)
|
||||
|
||||
val getWeightFromId = udf({id: Int => if (id == 0) 1.0f else 0.001f})
|
||||
val trainingDF = buildDataFrame(Regression.train)
|
||||
@@ -140,7 +147,8 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
|
||||
test("test predictionLeaf") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
"objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers,
|
||||
"tree_method" -> treeMethod)
|
||||
val training = buildDataFrame(Regression.train)
|
||||
val testDF = buildDataFrame(Regression.test)
|
||||
val groundTruth = testDF.count()
|
||||
@@ -154,7 +162,8 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
|
||||
test("test predictionLeaf with empty column name") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
"objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers,
|
||||
"tree_method" -> treeMethod)
|
||||
val training = buildDataFrame(Regression.train)
|
||||
val testDF = buildDataFrame(Regression.test)
|
||||
val xgb = new XGBoostRegressor(paramMap)
|
||||
@@ -166,7 +175,8 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
|
||||
test("test predictionContrib") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
"objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers,
|
||||
"tree_method" -> treeMethod)
|
||||
val training = buildDataFrame(Regression.train)
|
||||
val testDF = buildDataFrame(Regression.test)
|
||||
val groundTruth = testDF.count()
|
||||
@@ -180,7 +190,8 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
|
||||
test("test predictionContrib with empty column name") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
"objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers,
|
||||
"tree_method" -> treeMethod)
|
||||
val training = buildDataFrame(Regression.train)
|
||||
val testDF = buildDataFrame(Regression.test)
|
||||
val xgb = new XGBoostRegressor(paramMap)
|
||||
@@ -192,7 +203,8 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
|
||||
test("test predictionLeaf and predictionContrib") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers)
|
||||
"objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers,
|
||||
"tree_method" -> treeMethod)
|
||||
val training = buildDataFrame(Regression.train)
|
||||
val testDF = buildDataFrame(Regression.test)
|
||||
val groundTruth = testDF.count()
|
||||
@@ -206,3 +218,13 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
assert(resultDF.columns.contains("predictContrib"))
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs the tests inherited from XGBoostRegressorSuiteBase unchanged, i.e. with the base
 * class's default `treeMethod` of "auto".
 */
class XGBoostCpuRegressorSuite extends XGBoostRegressorSuiteBase
|
||||
|
||||
/**
 * Runs the tests inherited from XGBoostRegressorSuiteBase with `tree_method` set to
 * "gpu_hist" on a single worker.
 *
 * NOTE(review): @GpuTestSuite presumably tags the suite so it only runs on GPU builds —
 * confirm against the build configuration.
 */
@GpuTestSuite
class XGBoostGpuRegressorSuite extends XGBoostRegressorSuiteBase {
  // One worker only for the GPU run.
  override protected val numWorkers: Int = 1
  // Switch the inherited tests from the default "auto" to the GPU tree method.
  override protected val treeMethod: String = "gpu_hist"
}
|
||||
|
||||
Reference in New Issue
Block a user