[jvm-packages] xgboost4j-spark should work when featuresCols is specified (#7789)

This commit is contained in:
Bobby Wang
2022-04-08 13:21:04 +08:00
committed by GitHub
parent 729d227b89
commit 118192f116
12 changed files with 377 additions and 109 deletions

View File

@@ -23,6 +23,7 @@ import org.apache.spark.sql._
import org.scalatest.FunSuite
import org.apache.spark.Partitioner
import org.apache.spark.ml.feature.VectorAssembler
class XGBoostClassifierSuite extends FunSuite with PerTest {
@@ -316,4 +317,77 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
xgb.fit(repartitioned)
}
test("featuresCols with features column can work") {
val spark = ss
import spark.implicits._
val xgbInput = Seq(
(Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0),
(Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1))
.toDF("f1", "f2", "f3", "features", "label")
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> 1)
val featuresName = Array("f1", "f2", "f3", "features")
val xgbClassifier = new XGBoostClassifier(paramMap)
.setFeaturesCol(featuresName)
.setLabelCol("label")
val model = xgbClassifier.fit(xgbInput)
assert(model.getFeaturesCols.sameElements(featuresName))
val df = model.transform(xgbInput)
assert(df.schema.fieldNames.contains("features_" + model.uid))
df.show()
val newFeatureName = "features_new"
// transform also can work for vectorized dataset
val vectorizedInput = new VectorAssembler()
.setInputCols(featuresName)
.setOutputCol(newFeatureName)
.transform(xgbInput)
.select(newFeatureName, "label")
val df1 = model
.setFeaturesCol(newFeatureName)
.transform(vectorizedInput)
assert(df1.schema.fieldNames.contains(newFeatureName))
df1.show()
}
test("featuresCols without features column can work") {
val spark = ss
import spark.implicits._
val xgbInput = Seq(
(Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0),
(Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1))
.toDF("f1", "f2", "f3", "f4", "label")
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> 1)
val featuresName = Array("f1", "f2", "f3", "f4")
val xgbClassifier = new XGBoostClassifier(paramMap)
.setFeaturesCol(featuresName)
.setLabelCol("label")
val model = xgbClassifier.fit(xgbInput)
assert(model.getFeaturesCols.sameElements(featuresName))
// transform should work for the dataset which includes the feature column names.
val df = model.transform(xgbInput)
assert(df.schema.fieldNames.contains("features"))
df.show()
// transform also can work for vectorized dataset
val vectorizedInput = new VectorAssembler()
.setInputCols(featuresName)
.setOutputCol("features")
.transform(xgbInput)
.select("features", "label")
val df1 = model.transform(vectorizedInput)
df1.show()
}
}

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2014 by Contributors
Copyright (c) 2014-2022 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -17,12 +17,15 @@
package ml.dmlc.xgboost4j.scala.spark
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types._
import org.scalatest.FunSuite
import org.apache.spark.ml.feature.VectorAssembler
class XGBoostRegressorSuite extends FunSuite with PerTest {
protected val treeMethod: String = "auto"
@@ -216,4 +219,77 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
assert(resultDF.columns.contains("predictLeaf"))
assert(resultDF.columns.contains("predictContrib"))
}
test("featuresCols with features column can work") {
val spark = ss
import spark.implicits._
val xgbInput = Seq(
(Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0),
(Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1))
.toDF("f1", "f2", "f3", "features", "label")
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> 1)
val featuresName = Array("f1", "f2", "f3", "features")
val xgbClassifier = new XGBoostRegressor(paramMap)
.setFeaturesCol(featuresName)
.setLabelCol("label")
val model = xgbClassifier.fit(xgbInput)
assert(model.getFeaturesCols.sameElements(featuresName))
val df = model.transform(xgbInput)
assert(df.schema.fieldNames.contains("features_" + model.uid))
df.show()
val newFeatureName = "features_new"
// transform also can work for vectorized dataset
val vectorizedInput = new VectorAssembler()
.setInputCols(featuresName)
.setOutputCol(newFeatureName)
.transform(xgbInput)
.select(newFeatureName, "label")
val df1 = model
.setFeaturesCol(newFeatureName)
.transform(vectorizedInput)
assert(df1.schema.fieldNames.contains(newFeatureName))
df1.show()
}
test("featuresCols without features column can work") {
val spark = ss
import spark.implicits._
val xgbInput = Seq(
(Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0),
(Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1))
.toDF("f1", "f2", "f3", "f4", "label")
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
"objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> 1)
val featuresName = Array("f1", "f2", "f3", "f4")
val xgbClassifier = new XGBoostRegressor(paramMap)
.setFeaturesCol(featuresName)
.setLabelCol("label")
val model = xgbClassifier.fit(xgbInput)
assert(model.getFeaturesCols.sameElements(featuresName))
// transform should work for the dataset which includes the feature column names.
val df = model.transform(xgbInput)
assert(df.schema.fieldNames.contains("features"))
df.show()
// transform also can work for vectorized dataset
val vectorizedInput = new VectorAssembler()
.setInputCols(featuresName)
.setOutputCol("features")
.transform(xgbInput)
.select("features", "label")
val df1 = model.transform(vectorizedInput)
df1.show()
}
}