diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala
index 79f3feff8..7f25401b2 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.{Dataset, Row}
  * XGBoost Estimator to produce a XGBoost model
  */
 class XGBoostEstimator private[spark](
-    override val uid: String, private[spark] var xgboostParams: Map[String, Any])
+    override val uid: String, xgboostParams: Map[String, Any])
   extends Predictor[MLVector, XGBoostEstimator, XGBoostModel]
   with LearningTaskParams with GeneralParams with BoosterParams {
 
@@ -41,7 +41,6 @@ class XGBoostEstimator private[spark](
 
   def this(uid: String) = this(uid, Map[String, Any]())
 
-  // called in fromXGBParamMapToParams only when eval_metric is not defined
   private def setupDefaultEvalMetric(): String = {
     val objFunc = xgboostParams.getOrElse("objective", xgboostParams.getOrElse("obj_type", null))
 
@@ -93,16 +92,11 @@ class XGBoostEstimator private[spark](
 
   fromXGBParamMapToParams()
 
-  // only called when XGBParamMap is empty, i.e. in the constructor this(String)
-  // TODO: refactor to be functional
-  private def fromParamsToXGBParamMap(): Map[String, Any] = {
-    require(xgboostParams.isEmpty, "fromParamsToXGBParamMap can only be called when" +
-      " XGBParamMap is empty, i.e. in the constructor this(String)")
+  private[spark] def fromParamsToXGBParamMap: Map[String, Any] = {
     val xgbParamMap = new mutable.HashMap[String, Any]()
     for (param <- params) {
       xgbParamMap += param.name -> $(param)
     }
-    xgboostParams = xgbParamMap.toMap
     xgbParamMap.toMap
   }
 
@@ -116,8 +110,9 @@ class XGBoostEstimator private[spark](
       LabeledPoint(label, feature)
     }
     transformSchema(trainingSet.schema, logging = true)
-    val trainedModel = XGBoost.trainWithRDD(instances, xgboostParams, $(round), $(nWorkers),
-      $(customObj), $(customEval), $(useExternalMemory), $(missing)).setParent(this)
+    val trainedModel = XGBoost.trainWithRDD(instances, fromParamsToXGBParamMap,
+      $(round), $(nWorkers), $(customObj), $(customEval), $(useExternalMemory),
+      $(missing)).setParent(this)
     val returnedModel = copyValues(trainedModel)
     if (XGBoost.isClassificationTask(xgboostParams)) {
       val numClass = {
@@ -133,11 +128,6 @@ class XGBoostEstimator private[spark](
   }
 
   override def copy(extra: ParamMap): XGBoostEstimator = {
-    val est = defaultCopy(extra).asInstanceOf[XGBoostEstimator]
-    // we need to synchronize the params here instead of in the constructor
-    // because we cannot guarantee that params (default implementation) is initialized fully
-    // before the other params
-    est.fromParamsToXGBParamMap()
-    est
+    defaultCopy(extra).asInstanceOf[XGBoostEstimator]
   }
 }
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala
index 79a8c8368..6273a0763 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala
@@ -196,7 +196,7 @@ trait BoosterParams extends Params {
     minChildWeight -> 1, maxDeltaStep -> 0,
     subSample -> 1, colSampleByTree -> 1, colSampleByLevel -> 1,
     lambda -> 1, alpha -> 0, treeMethod -> "auto", sketchEps -> 0.03,
-    scalePosWeight -> 1, sampleType -> "uniform", normalizeType -> "tree",
+    scalePosWeight -> 1.0, sampleType -> "uniform", normalizeType -> "tree",
     rateDrop -> 0.0, skipDrop -> 0.0, lambdaBias -> 0)
 
   /**
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostDFSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostDFSuite.scala
index 8a0bed92b..918b0c511 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostDFSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostDFSuite.scala
@@ -18,10 +18,13 @@ package ml.dmlc.xgboost4j.scala.spark
 
 import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
 import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
+
 import org.apache.spark.SparkContext
+import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
 import org.apache.spark.ml.feature.LabeledPoint
 import org.apache.spark.ml.linalg.DenseVector
 import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
 import org.apache.spark.sql._
 
 class XGBoostDFSuite extends SharedSparkContext with Utils {
@@ -47,23 +50,21 @@ class XGBoostDFSuite extends SharedSparkContext with Utils {
     val (testItr, auxTestItr) =
       loadLabelPoints(getClass.getResource("/agaricus.txt.test").getFile).iterator.duplicate
     import DataUtils._
+    val round = 5
     val trainDMatrix = new DMatrix(new JDMatrix(trainingItr, null))
     val testDMatrix = new DMatrix(new JDMatrix(testItr, null))
-    val xgboostModel = ScalaXGBoost.train(trainDMatrix, paramMap, 5)
+    val xgboostModel = ScalaXGBoost.train(trainDMatrix, paramMap, round)
     val predResultFromSeq = xgboostModel.predict(testDMatrix)
     val testSetItr = auxTestItr.zipWithIndex.map {
-      case (instance: LabeledPoint, id: Int) =>
-        (id, instance.features, instance.label)
+      case (instance: LabeledPoint, id: Int) => (id, instance.features, instance.label)
     }
     val trainingDF = buildTrainingDataframe()
     val xgBoostModelWithDF = XGBoost.trainWithDataFrame(trainingDF, paramMap,
-      round = 5, nWorkers = numWorkers, useExternalMemory = false)
+      round = round, nWorkers = numWorkers, useExternalMemory = false)
     val testDF = trainingDF.sparkSession.createDataFrame(testSetItr.toList).toDF(
       "id", "features", "label")
     val predResultsFromDF = xgBoostModelWithDF.setExternalMemory(true).transform(testDF).
-      collect().map(row =>
-        (row.getAs[Int]("id"), row.getAs[DenseVector]("probabilities"))
-      ).toMap
+      collect().map(row => (row.getAs[Int]("id"), row.getAs[DenseVector]("probabilities"))).toMap
     assert(testDF.count() === predResultsFromDF.size)
     // the vector length in probabilties column is 2 since we have to fit to the evaluator in
     // Spark
@@ -169,8 +170,8 @@ class XGBoostDFSuite extends SharedSparkContext with Utils {
     assert(xgbEstimator.get(xgbEstimator.objective).get === "binary:logistic")
     // from spark to xgboost params
     val xgbEstimatorCopy = xgbEstimator.copy(ParamMap.empty)
-    assert(xgbEstimatorCopy.xgboostParams.get("eta").get.toString.toDouble === 1.0)
-    assert(xgbEstimatorCopy.xgboostParams.get("objective").get.toString === "binary:logistic")
+    assert(xgbEstimatorCopy.fromParamsToXGBParamMap("eta").toString.toDouble === 1.0)
+    assert(xgbEstimatorCopy.fromParamsToXGBParamMap("objective").toString === "binary:logistic")
   }
 
   test("eval_metric is configured correctly") {
@@ -179,11 +180,8 @@ class XGBoostDFSuite extends SharedSparkContext with Utils {
     assert(xgbEstimator.get(xgbEstimator.evalMetric).get === "error")
     val sparkParamMap = ParamMap.empty
     val xgbEstimatorCopy = xgbEstimator.copy(sparkParamMap)
-    assert(xgbEstimatorCopy.xgboostParams.get("eval_metric") === Some("error"))
+    assert(xgbEstimatorCopy.fromParamsToXGBParamMap("eval_metric") === "error")
     val xgbEstimatorCopy1 = xgbEstimator.copy(sparkParamMap.put(xgbEstimator.evalMetric, "logloss"))
-    assert(xgbEstimatorCopy1.xgboostParams.get("eval_metric") === Some("logloss"))
+    assert(xgbEstimatorCopy1.fromParamsToXGBParamMap("eval_metric") === "logloss")
   }
-
-
-
 }
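Reviewer note (outside the diff): a minimal sketch of the param flow this change establishes. Spark ML Params become the single source of truth, and fromParamsToXGBParamMap re-derives the xgboost-side map on demand, which is why copy() can now fall back to a plain defaultCopy. The Map-based convenience constructor is an assumption taken from the test suite's usage, and since fromParamsToXGBParamMap is private[spark], the snippet only compiles inside the ml.dmlc.xgboost4j.scala.spark package.

import org.apache.spark.ml.param.ParamMap

// Assumed convenience constructor (mirrors the suite's setup); the supplied map
// is pushed into Spark ML Params by fromXGBParamMapToParams() at construction.
val est = new XGBoostEstimator(Map("eta" -> 1.0, "objective" -> "binary:logistic"))

// defaultCopy(extra) suffices: the copy carries the Spark ML Params, and the
// xgboost map is derived from them instead of living in a mutable field.
val copied = est.copy(ParamMap.empty)
assert(copied.fromParamsToXGBParamMap("eta").toString.toDouble == 1.0)

// Overrides supplied at copy time flow through the same one-way path.
val tuned = est.copy(ParamMap.empty.put(est.evalMetric, "logloss"))
assert(tuned.fromParamsToXGBParamMap("eval_metric") == "logloss")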