[jvm-packages] xgboost4j-spark should work when featuresCols is specified (#7789)
This commit is contained in:
@@ -23,6 +23,7 @@ import org.apache.spark.sql._
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
import org.apache.spark.Partitioner
|
||||
import org.apache.spark.ml.feature.VectorAssembler
|
||||
|
||||
class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
|
||||
@@ -316,4 +317,77 @@ class XGBoostClassifierSuite extends FunSuite with PerTest {
|
||||
xgb.fit(repartitioned)
|
||||
}
|
||||
|
||||
test("featuresCols with features column can work") {
  val spark = ss
  import spark.implicits._

  // One of the declared feature columns is itself named "features".
  val input = Seq(
    (Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0),
    (Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1))
    .toDF("f1", "f2", "f3", "features", "label")

  val params = Map(
    "eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> 1)

  val featureCols = Array("f1", "f2", "f3", "features")
  val classifier = new XGBoostClassifier(params)
    .setFeaturesCol(featureCols)
    .setLabelCol("label")

  val model = classifier.fit(input)
  assert(model.getFeaturesCols.sameElements(featureCols))

  // "features" is taken by an input column, so the assembled vector column
  // is disambiguated with the model uid suffix.
  val predicted = model.transform(input)
  assert(predicted.schema.fieldNames.contains("features_" + model.uid))
  predicted.show()

  // transform also can work for vectorized dataset
  val newFeatureName = "features_new"
  val vectorized = new VectorAssembler()
    .setInputCols(featureCols)
    .setOutputCol(newFeatureName)
    .transform(input)
    .select(newFeatureName, "label")

  val predictedFromVector = model
    .setFeaturesCol(newFeatureName)
    .transform(vectorized)
  assert(predictedFromVector.schema.fieldNames.contains(newFeatureName))
  predictedFromVector.show()
}
|
||||
|
||||
test("featuresCols without features column can work") {
  val spark = ss
  import spark.implicits._
  val xgbInput = Seq(
    (Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0),
    (Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1))
    .toDF("f1", "f2", "f3", "f4", "label")

  val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> 1)

  val featuresName = Array("f1", "f2", "f3", "f4")
  val xgbClassifier = new XGBoostClassifier(paramMap)
    .setFeaturesCol(featuresName)
    .setLabelCol("label")

  val model = xgbClassifier.fit(xgbInput)
  assert(model.getFeaturesCols.sameElements(featuresName))

  // transform should work for the dataset which includes the feature column names.
  // No input column is named "features", so the assembled vector keeps that name.
  val df = model.transform(xgbInput)
  assert(df.schema.fieldNames.contains("features"))
  df.show()

  // transform also can work for vectorized dataset
  val vectorizedInput = new VectorAssembler()
    .setInputCols(featuresName)
    .setOutputCol("features")
    .transform(xgbInput)
    .select("features", "label")

  val df1 = model.transform(vectorizedInput)
  // Previously this result was only show()n, never checked; assert the
  // vectorized path preserves the "features" column like the sibling test does.
  assert(df1.schema.fieldNames.contains("features"))
  df1.show()
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2014 by Contributors
|
||||
Copyright (c) 2014-2022 by Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@@ -17,12 +17,15 @@
|
||||
package ml.dmlc.xgboost4j.scala.spark
|
||||
|
||||
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
|
||||
import org.apache.spark.ml.linalg.Vector
|
||||
|
||||
import org.apache.spark.ml.linalg.{Vector, Vectors}
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.{DataFrame, Row}
|
||||
import org.apache.spark.sql.types._
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
import org.apache.spark.ml.feature.VectorAssembler
|
||||
|
||||
class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
protected val treeMethod: String = "auto"
|
||||
|
||||
@@ -216,4 +219,77 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
|
||||
assert(resultDF.columns.contains("predictLeaf"))
|
||||
assert(resultDF.columns.contains("predictContrib"))
|
||||
}
|
||||
|
||||
test("featuresCols with features column can work") {
  val spark = ss
  import spark.implicits._
  val xgbInput = Seq(
    (Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0),
    (Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1))
    .toDF("f1", "f2", "f3", "features", "label")

  val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> 1)

  val featuresName = Array("f1", "f2", "f3", "features")
  // Renamed from the copy-pasted "xgbClassifier": this suite exercises the regressor.
  val xgbRegressor = new XGBoostRegressor(paramMap)
    .setFeaturesCol(featuresName)
    .setLabelCol("label")

  val model = xgbRegressor.fit(xgbInput)
  assert(model.getFeaturesCols.sameElements(featuresName))

  // "features" is taken by an input column, so the assembled vector column
  // is disambiguated with the model uid suffix.
  val df = model.transform(xgbInput)
  assert(df.schema.fieldNames.contains("features_" + model.uid))
  df.show()

  val newFeatureName = "features_new"
  // transform also can work for vectorized dataset
  val vectorizedInput = new VectorAssembler()
    .setInputCols(featuresName)
    .setOutputCol(newFeatureName)
    .transform(xgbInput)
    .select(newFeatureName, "label")

  val df1 = model
    .setFeaturesCol(newFeatureName)
    .transform(vectorizedInput)
  assert(df1.schema.fieldNames.contains(newFeatureName))
  df1.show()
}
|
||||
|
||||
test("featuresCols without features column can work") {
  val spark = ss
  import spark.implicits._
  val xgbInput = Seq(
    (Vectors.dense(1.0, 7.0), true, 10.1, 100.2, 0),
    (Vectors.dense(2.0, 20.0), false, 2.1, 2.2, 1))
    .toDF("f1", "f2", "f3", "f4", "label")

  val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
    "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> 1)

  val featuresName = Array("f1", "f2", "f3", "f4")
  // Renamed from the copy-pasted "xgbClassifier": this suite exercises the regressor.
  val xgbRegressor = new XGBoostRegressor(paramMap)
    .setFeaturesCol(featuresName)
    .setLabelCol("label")

  val model = xgbRegressor.fit(xgbInput)
  assert(model.getFeaturesCols.sameElements(featuresName))

  // transform should work for the dataset which includes the feature column names.
  // No input column is named "features", so the assembled vector keeps that name.
  val df = model.transform(xgbInput)
  assert(df.schema.fieldNames.contains("features"))
  df.show()

  // transform also can work for vectorized dataset
  val vectorizedInput = new VectorAssembler()
    .setInputCols(featuresName)
    .setOutputCol("features")
    .transform(xgbInput)
    .select("features", "label")

  val df1 = model.transform(vectorizedInput)
  // Previously this result was only show()n, never checked; assert the
  // vectorized path preserves the "features" column like the sibling test does.
  assert(df1.schema.fieldNames.contains("features"))
  df1.show()
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user