[jvm-packages] unify the set features API (#7692)
xgboost4j-spark provides two sets of APIs for setting the feature columns, one for CPU and another for GPU, which may cause confusion. This PR removes the GPU-specific API and adds an overload of the CPU function setFeaturesCol that accepts an Array[String] parameter.
This commit is contained in:
parent
c859764d29
commit
e3e6de5ed9
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
Copyright (c) 2021 by Contributors
|
Copyright (c) 2021-2022 by Contributors
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
@ -39,7 +39,7 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
|
|||||||
StructField("f10", FloatType), StructField("f11", FloatType), StructField("f12", FloatType),
|
StructField("f10", FloatType), StructField("f11", FloatType), StructField("f12", FloatType),
|
||||||
StructField(labelName, FloatType)
|
StructField(labelName, FloatType)
|
||||||
))
|
))
|
||||||
val featureNames = schema.fieldNames.filter(s => !s.equals(labelName)).toSeq
|
val featureNames = schema.fieldNames.filter(s => !s.equals(labelName))
|
||||||
|
|
||||||
test("The transform result should be same for several runs on same model") {
|
test("The transform result should be same for several runs on same model") {
|
||||||
withGpuSparkSession(enableCsvConf()) { spark =>
|
withGpuSparkSession(enableCsvConf()) { spark =>
|
||||||
@ -90,7 +90,7 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
|
|||||||
.csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
|
.csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
|
||||||
|
|
||||||
val classifier = new XGBoostClassifier(xgbParam)
|
val classifier = new XGBoostClassifier(xgbParam)
|
||||||
.setFeaturesCols(featureNames)
|
.setFeaturesCol(featureNames)
|
||||||
.setLabelCol(labelName)
|
.setLabelCol(labelName)
|
||||||
.setTreeMethod("gpu_hist")
|
.setTreeMethod("gpu_hist")
|
||||||
(classifier.fit(rawInput), testDf)
|
(classifier.fit(rawInput), testDf)
|
||||||
@ -155,12 +155,12 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
|
|||||||
"please refer to setFeaturesCols"))
|
"please refer to setFeaturesCols"))
|
||||||
|
|
||||||
val left = cpuModel
|
val left = cpuModel
|
||||||
.setFeaturesCols(featureNames)
|
.setFeaturesCol(featureNames)
|
||||||
.transform(testDf)
|
.transform(testDf)
|
||||||
.collect()
|
.collect()
|
||||||
|
|
||||||
val right = cpuModelFromFile
|
val right = cpuModelFromFile
|
||||||
.setFeaturesCols(featureNames)
|
.setFeaturesCol(featureNames)
|
||||||
.transform(testDf)
|
.transform(testDf)
|
||||||
.collect()
|
.collect()
|
||||||
|
|
||||||
@ -177,7 +177,7 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
|
|||||||
.csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
|
.csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
|
||||||
|
|
||||||
val classifier = new XGBoostClassifier(xgbParam)
|
val classifier = new XGBoostClassifier(xgbParam)
|
||||||
.setFeaturesCols(featureNames)
|
.setFeaturesCol(featureNames)
|
||||||
.setLabelCol(labelName)
|
.setLabelCol(labelName)
|
||||||
.setTreeMethod("gpu_hist")
|
.setTreeMethod("gpu_hist")
|
||||||
classifier.fit(rawInput)
|
classifier.fit(rawInput)
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
Copyright (c) 2021 by Contributors
|
Copyright (c) 2021-2022 by Contributors
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
@ -28,7 +28,7 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite {
|
|||||||
private val labelName = "label_col"
|
private val labelName = "label_col"
|
||||||
private val weightName = "weight_col"
|
private val weightName = "weight_col"
|
||||||
private val baseMarginName = "margin_col"
|
private val baseMarginName = "margin_col"
|
||||||
private val featureNames = Seq("f1", "f2", "f3")
|
private val featureNames = Array("f1", "f2", "f3")
|
||||||
private val allColumnNames = featureNames :+ weightName :+ baseMarginName :+ labelName
|
private val allColumnNames = featureNames :+ weightName :+ baseMarginName :+ labelName
|
||||||
private val trainingData = Seq(
|
private val trainingData = Seq(
|
||||||
// f1, f2, f3, weight, margin, label
|
// f1, f2, f3, weight, margin, label
|
||||||
@ -68,7 +68,7 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite {
|
|||||||
val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
|
val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
|
||||||
"num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist")
|
"num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist")
|
||||||
new XGBoostClassifier(xgbParam)
|
new XGBoostClassifier(xgbParam)
|
||||||
.setFeaturesCols(featureNames)
|
.setFeaturesCol(featureNames)
|
||||||
.setLabelCol(labelName)
|
.setLabelCol(labelName)
|
||||||
.fit(trainingDf)
|
.fit(trainingDf)
|
||||||
}
|
}
|
||||||
@ -84,7 +84,7 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite {
|
|||||||
"num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist")
|
"num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist")
|
||||||
val thrown1 = intercept[IllegalArgumentException] {
|
val thrown1 = intercept[IllegalArgumentException] {
|
||||||
new XGBoostClassifier(xgbParam)
|
new XGBoostClassifier(xgbParam)
|
||||||
.setFeaturesCols(featureNames)
|
.setFeaturesCol(featureNames)
|
||||||
.setLabelCol(labelName)
|
.setLabelCol(labelName)
|
||||||
.fit(trainingDf)
|
.fit(trainingDf)
|
||||||
}
|
}
|
||||||
@ -93,7 +93,7 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite {
|
|||||||
trainingDf = originalDf.withColumn(labelName, col(labelName).cast(StringType))
|
trainingDf = originalDf.withColumn(labelName, col(labelName).cast(StringType))
|
||||||
val thrown2 = intercept[IllegalArgumentException] {
|
val thrown2 = intercept[IllegalArgumentException] {
|
||||||
new XGBoostClassifier(xgbParam)
|
new XGBoostClassifier(xgbParam)
|
||||||
.setFeaturesCols(featureNames)
|
.setFeaturesCol(featureNames)
|
||||||
.setLabelCol(labelName)
|
.setLabelCol(labelName)
|
||||||
.fit(trainingDf)
|
.fit(trainingDf)
|
||||||
}
|
}
|
||||||
@ -117,7 +117,7 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite {
|
|||||||
|
|
||||||
val thrown1 = intercept[IllegalArgumentException] {
|
val thrown1 = intercept[IllegalArgumentException] {
|
||||||
new XGBoostClassifier(xgbParam)
|
new XGBoostClassifier(xgbParam)
|
||||||
.setFeaturesCols(featureNames)
|
.setFeaturesCol(featureNames)
|
||||||
.fit(trainingDf)
|
.fit(trainingDf)
|
||||||
}
|
}
|
||||||
assert(thrown1.getMessage.contains("label does not exist."))
|
assert(thrown1.getMessage.contains("label does not exist."))
|
||||||
@ -132,7 +132,7 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite {
|
|||||||
"num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "hist")
|
"num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "hist")
|
||||||
val thrown = intercept[IllegalArgumentException] {
|
val thrown = intercept[IllegalArgumentException] {
|
||||||
new XGBoostClassifier(xgbParam)
|
new XGBoostClassifier(xgbParam)
|
||||||
.setFeaturesCols(featureNames)
|
.setFeaturesCol(featureNames)
|
||||||
.setLabelCol(labelName)
|
.setLabelCol(labelName)
|
||||||
.fit(trainingDf)
|
.fit(trainingDf)
|
||||||
}
|
}
|
||||||
@ -149,7 +149,7 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite {
|
|||||||
val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
|
val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
|
||||||
"num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist")
|
"num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist")
|
||||||
val model1 = new XGBoostClassifier(xgbParam)
|
val model1 = new XGBoostClassifier(xgbParam)
|
||||||
.setFeaturesCols(featureNames)
|
.setFeaturesCol(featureNames)
|
||||||
.setLabelCol(labelName)
|
.setLabelCol(labelName)
|
||||||
.setEvalSets(Map("eval1" -> eval1, "eval2" -> eval2))
|
.setEvalSets(Map("eval1" -> eval1, "eval2" -> eval2))
|
||||||
.fit(trainingDf)
|
.fit(trainingDf)
|
||||||
@ -166,7 +166,6 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite {
|
|||||||
test("test persistence of XGBoostClassifier and XGBoostClassificationModel") {
|
test("test persistence of XGBoostClassifier and XGBoostClassificationModel") {
|
||||||
val xgbcPath = new File(tempDir.toFile, "xgbc").getPath
|
val xgbcPath = new File(tempDir.toFile, "xgbc").getPath
|
||||||
withGpuSparkSession() { spark =>
|
withGpuSparkSession() { spark =>
|
||||||
import spark.implicits._
|
|
||||||
val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
|
val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob",
|
||||||
"num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist",
|
"num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist",
|
||||||
"features_cols" -> featureNames, "label_col" -> labelName)
|
"features_cols" -> featureNames, "label_col" -> labelName)
|
||||||
@ -174,7 +173,10 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite {
|
|||||||
xgbc.write.overwrite().save(xgbcPath)
|
xgbc.write.overwrite().save(xgbcPath)
|
||||||
val paramMap2 = XGBoostClassifier.load(xgbcPath).MLlib2XGBoostParams
|
val paramMap2 = XGBoostClassifier.load(xgbcPath).MLlib2XGBoostParams
|
||||||
xgbParam.foreach {
|
xgbParam.foreach {
|
||||||
case (k, v) => assert(v.toString == paramMap2(k).toString)
|
case (k, v: Array[String]) =>
|
||||||
|
assert(v.sameElements(paramMap2(k).asInstanceOf[Array[String]]))
|
||||||
|
case (k, v) =>
|
||||||
|
assert(v.toString == paramMap2(k).toString)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -35,7 +35,7 @@ class GpuXGBoostRegressorSuite extends GpuTestSuite {
|
|||||||
StructField("f3", FloatType),
|
StructField("f3", FloatType),
|
||||||
StructField(groupName, IntegerType)))
|
StructField(groupName, IntegerType)))
|
||||||
val featureNames = schema.fieldNames.filter(s =>
|
val featureNames = schema.fieldNames.filter(s =>
|
||||||
!(s.equals(labelName) || s.equals(groupName))).toSeq
|
!(s.equals(labelName) || s.equals(groupName)))
|
||||||
|
|
||||||
test("The transform result should be same for several runs on same model") {
|
test("The transform result should be same for several runs on same model") {
|
||||||
withGpuSparkSession(enableCsvConf()) { spark =>
|
withGpuSparkSession(enableCsvConf()) { spark =>
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
Copyright (c) 2014,2021 by Contributors
|
Copyright (c) 2014-2022 by Contributors
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
@ -148,7 +148,7 @@ class XGBoostClassifier (
|
|||||||
* This API is only used in GPU train pipeline of xgboost4j-spark-gpu, which requires
|
* This API is only used in GPU train pipeline of xgboost4j-spark-gpu, which requires
|
||||||
* all feature columns must be numeric types.
|
* all feature columns must be numeric types.
|
||||||
*/
|
*/
|
||||||
def setFeaturesCols(value: Seq[String]): this.type =
|
def setFeaturesCol(value: Array[String]): this.type =
|
||||||
set(featuresCols, value)
|
set(featuresCols, value)
|
||||||
|
|
||||||
// called at the start of fit/train when 'eval_metric' is not defined
|
// called at the start of fit/train when 'eval_metric' is not defined
|
||||||
@ -264,7 +264,7 @@ class XGBoostClassificationModel private[ml](
|
|||||||
* This API is only used in GPU train pipeline of xgboost4j-spark-gpu, which requires
|
* This API is only used in GPU train pipeline of xgboost4j-spark-gpu, which requires
|
||||||
* all feature columns must be numeric types.
|
* all feature columns must be numeric types.
|
||||||
*/
|
*/
|
||||||
def setFeaturesCols(value: Seq[String]): this.type =
|
def setFeaturesCol(value: Array[String]): this.type =
|
||||||
set(featuresCols, value)
|
set(featuresCols, value)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
Copyright (c) 2014,2021 by Contributors
|
Copyright (c) 2014-2022 by Contributors
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
@ -150,7 +150,7 @@ class XGBoostRegressor (
|
|||||||
* This API is only used in GPU train pipeline of xgboost4j-spark-gpu, which requires
|
* This API is only used in GPU train pipeline of xgboost4j-spark-gpu, which requires
|
||||||
* all feature columns must be numeric types.
|
* all feature columns must be numeric types.
|
||||||
*/
|
*/
|
||||||
def setFeaturesCols(value: Seq[String]): this.type =
|
def setFeaturesCols(value: Array[String]): this.type =
|
||||||
set(featuresCols, value)
|
set(featuresCols, value)
|
||||||
|
|
||||||
// called at the start of fit/train when 'eval_metric' is not defined
|
// called at the start of fit/train when 'eval_metric' is not defined
|
||||||
@ -257,7 +257,7 @@ class XGBoostRegressionModel private[ml] (
|
|||||||
* This API is only used in GPU train pipeline of xgboost4j-spark-gpu, which requires
|
* This API is only used in GPU train pipeline of xgboost4j-spark-gpu, which requires
|
||||||
* all feature columns must be numeric types.
|
* all feature columns must be numeric types.
|
||||||
*/
|
*/
|
||||||
def setFeaturesCols(value: Seq[String]): this.type =
|
def setFeaturesCols(value: Array[String]): this.type =
|
||||||
set(featuresCols, value)
|
set(featuresCols, value)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
Copyright (c) 2021 by Contributors
|
Copyright (c) 2021-2022 by Contributors
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
@ -16,38 +16,19 @@
|
|||||||
|
|
||||||
package ml.dmlc.xgboost4j.scala.spark.params
|
package ml.dmlc.xgboost4j.scala.spark.params
|
||||||
|
|
||||||
import org.json4s.DefaultFormats
|
import org.apache.spark.ml.param.{Params, StringArrayParam}
|
||||||
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
|
||||||
|
|
||||||
import org.apache.spark.ml.param.{BooleanParam, Param, Params}
|
|
||||||
|
|
||||||
trait GpuParams extends Params {
|
trait GpuParams extends Params {
|
||||||
/**
|
/**
|
||||||
* Param for the names of feature columns.
|
* Param for the names of feature columns for GPU pipeline.
|
||||||
* @group param
|
* @group param
|
||||||
*/
|
*/
|
||||||
final val featuresCols: StringSeqParam = new StringSeqParam(this, "featuresCols",
|
final val featuresCols: StringArrayParam = new StringArrayParam(this, "featuresCols",
|
||||||
"a sequence of feature column names.")
|
"an array of feature column names for GPU pipeline.")
|
||||||
|
|
||||||
setDefault(featuresCols, Seq.empty[String])
|
setDefault(featuresCols, Array.empty[String])
|
||||||
|
|
||||||
/** @group getParam */
|
/** @group getParam */
|
||||||
final def getFeaturesCols: Seq[String] = $(featuresCols)
|
final def getFeaturesCols: Array[String] = $(featuresCols)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class StringSeqParam(
|
|
||||||
parent: Params,
|
|
||||||
name: String,
|
|
||||||
doc: String) extends Param[Seq[String]](parent, name, doc) {
|
|
||||||
|
|
||||||
override def jsonEncode(value: Seq[String]): String = {
|
|
||||||
import org.json4s.JsonDSL._
|
|
||||||
compact(render(value))
|
|
||||||
}
|
|
||||||
|
|
||||||
override def jsonDecode(json: String): Seq[String] = {
|
|
||||||
implicit val formats = DefaultFormats
|
|
||||||
parse(json).extract[Seq[String]]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user