[jvm-packages] xgboost4j-spark should work when featuresCols is specified (#7789)

2022-04-08 13:21:04 +08:00
parent 729d227b89
commit 118192f116
12 changed files with 377 additions and 109 deletions
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
@@ -23,6 +23,7 @@ import org.apache.spark.sql._
 import org.scalatest.FunSuite

 import org.apache.spark.Partitioner
+import org.apache.spark.ml.feature.VectorAssembler

 class XGBoostClassifierSuite extends FunSuite with PerTest {

@@ -316,4 +317,77 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
    xgb.fit(repartitioned)
  }

+  test("featuresCols with features column can work") {
+    val spark = ss
+    import spark.implicits._
+    val xgbInput = Seq(
+      (Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0),
+      (Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1))
+      .toDF("f1", "f2", "f3", "features", "label")
+
+    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+      "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> 1)
+
+    val featuresName = Array("f1", "f2", "f3", "features")
+    val xgbClassifier = new XGBoostClassifier(paramMap)
+      .setFeaturesCol(featuresName)
+      .setLabelCol("label")
+
+    val model = xgbClassifier.fit(xgbInput)
+    assert(model.getFeaturesCols.sameElements(featuresName))
+
+    val df = model.transform(xgbInput)
+    assert(df.schema.fieldNames.contains("features_" + model.uid))
+    df.show()
+
+    val newFeatureName = "features_new"
+    // transform also can work for vectorized dataset
+    val vectorizedInput = new VectorAssembler()
+      .setInputCols(featuresName)
+      .setOutputCol(newFeatureName)
+      .transform(xgbInput)
+      .select(newFeatureName, "label")
+
+    val df1 = model
+      .setFeaturesCol(newFeatureName)
+      .transform(vectorizedInput)
+    assert(df1.schema.fieldNames.contains(newFeatureName))
+    df1.show()
+  }
+
+  test("featuresCols without features column can work") {
+    val spark = ss
+    import spark.implicits._
+    val xgbInput = Seq(
+      (Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0),
+      (Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1))
+      .toDF("f1", "f2", "f3", "f4", "label")
+
+    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+      "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> 1)
+
+    val featuresName = Array("f1", "f2", "f3", "f4")
+    val xgbClassifier = new XGBoostClassifier(paramMap)
+      .setFeaturesCol(featuresName)
+      .setLabelCol("label")
+
+    val model = xgbClassifier.fit(xgbInput)
+    assert(model.getFeaturesCols.sameElements(featuresName))
+
+    // transform should work for the dataset which includes the feature column names.
+    val df = model.transform(xgbInput)
+    assert(df.schema.fieldNames.contains("features"))
+    df.show()
+
+    // transform also can work for vectorized dataset
+    val vectorizedInput = new VectorAssembler()
+      .setInputCols(featuresName)
+      .setOutputCol("features")
+      .transform(xgbInput)
+      .select("features", "label")
+
+    val df1 = model.transform(vectorizedInput)
+    df1.show()
+  }
+
 }
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014 by Contributors
+ Copyright (c) 2014-2022 by Contributors

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,12 +17,15 @@
 package ml.dmlc.xgboost4j.scala.spark

 import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
-import org.apache.spark.ml.linalg.Vector
+
+import org.apache.spark.ml.linalg.{Vector, Vectors}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.{DataFrame, Row}
 import org.apache.spark.sql.types._
 import org.scalatest.FunSuite

+import org.apache.spark.ml.feature.VectorAssembler
+
 class XGBoostRegressorSuite extends FunSuite with PerTest {
  protected val treeMethod: String = "auto"

@@ -216,4 +219,77 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
    assert(resultDF.columns.contains("predictLeaf"))
    assert(resultDF.columns.contains("predictContrib"))
  }
+
+  test("featuresCols with features column can work") {
+    val spark = ss
+    import spark.implicits._
+    val xgbInput = Seq(
+      (Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0),
+      (Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1))
+      .toDF("f1", "f2", "f3", "features", "label")
+
+    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+      "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> 1)
+
+    val featuresName = Array("f1", "f2", "f3", "features")
+    val xgbClassifier = new XGBoostRegressor(paramMap)
+      .setFeaturesCol(featuresName)
+      .setLabelCol("label")
+
+    val model = xgbClassifier.fit(xgbInput)
+    assert(model.getFeaturesCols.sameElements(featuresName))
+
+    val df = model.transform(xgbInput)
+    assert(df.schema.fieldNames.contains("features_" + model.uid))
+    df.show()
+
+    val newFeatureName = "features_new"
+    // transform also can work for vectorized dataset
+    val vectorizedInput = new VectorAssembler()
+      .setInputCols(featuresName)
+      .setOutputCol(newFeatureName)
+      .transform(xgbInput)
+      .select(newFeatureName, "label")
+
+    val df1 = model
+      .setFeaturesCol(newFeatureName)
+      .transform(vectorizedInput)
+    assert(df1.schema.fieldNames.contains(newFeatureName))
+    df1.show()
+  }
+
+  test("featuresCols without features column can work") {
+    val spark = ss
+    import spark.implicits._
+    val xgbInput = Seq(
+      (Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0),
+      (Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1))
+      .toDF("f1", "f2", "f3", "f4", "label")
+
+    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+      "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> 1)
+
+    val featuresName = Array("f1", "f2", "f3", "f4")
+    val xgbClassifier = new XGBoostRegressor(paramMap)
+      .setFeaturesCol(featuresName)
+      .setLabelCol("label")
+
+    val model = xgbClassifier.fit(xgbInput)
+    assert(model.getFeaturesCols.sameElements(featuresName))
+
+    // transform should work for the dataset which includes the feature column names.
+    val df = model.transform(xgbInput)
+    assert(df.schema.fieldNames.contains("features"))
+    df.show()
+
+    // transform also can work for vectorized dataset
+    val vectorizedInput = new VectorAssembler()
+      .setInputCols(featuresName)
+      .setOutputCol("features")
+      .transform(xgbInput)
+      .select("features", "label")
+
+    val df1 = model.transform(vectorizedInput)
+    df1.show()
+  }
 }