Fix prediction heuristic (#5955)

* Relax check for prediction. * Relax test in spark test. * Add tests in C++.
2020-07-29 19:24:07 +08:00
parent 5879acde9a
commit 75b8c22b0b
11 changed files with 103 additions and 28 deletions
--- a/jvm-packages/.gitignore
+++ b/jvm-packages/.gitignore
@@ -1,3 +1,2 @@
 tracker.py
 build.sh
-
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala
@@ -21,19 +21,23 @@ import org.apache.spark.Partitioner
 import org.apache.spark.ml.feature.VectorAssembler
 import org.apache.spark.sql.SparkSession
 import org.scalatest.FunSuite
+import org.apache.spark.sql.functions._

 import scala.util.Random

 class FeatureSizeValidatingSuite extends FunSuite with PerTest {

-  test("transform throwing exception if feature size of dataset is different with model's") {
+  test("transform throwing exception if feature size of dataset is greater than model's") {
    val modelPath = getClass.getResource("/model/0.82/model").getPath
    val model = XGBoostClassificationModel.read.load(modelPath)
    val r = new Random(0)
    // 0.82/model was trained with 251 features, and transform will throw an exception
    // if the feature size of the data is greater than 251
-    val df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))).
+    var df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))).
      toDF("feature", "label")
+    for (x <- 1 to 252) {
+      df = df.withColumn(s"feature_${x}", lit(1))
+    }
    val assembler = new VectorAssembler()
      .setInputCols(df.columns.filter(!_.contains("label")))
      .setOutputCol("features")
@@ -67,5 +71,4 @@ class FeatureSizeValidatingSuite extends FunSuite with PerTest {
      xgb.fit(repartitioned)
    }
  }
-
 }