diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 6a632571e..426daa82c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -210,8 +210,6 @@ jobs:
       if: matrix.os == 'ubuntu-latest'  # Distributed training doesn't work on Windows
       env:
         RABIT_MOCK: ON
-        SPARK_LOCAL_IP: 127.0.0.1
-        XGBOOST_RABIT_TRACKER_IP_FOR_TEST: 127.0.0.1
 
   lint:
     runs-on: ubuntu-latest
diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst
index 2d2187e5d..81f7386e8 100644
--- a/doc/jvm/xgboost4j_spark_tutorial.rst
+++ b/doc/jvm/xgboost4j_spark_tutorial.rst
@@ -162,17 +162,17 @@ Example of setting a missing value (e.g. -999) to the "missing" parameter in XGB
 doing this with missing values encoded as NaN, you will want to set ``setHandleInvalid = "keep"``
 on VectorAssembler in order to keep the NaN values in the dataset. You would then set the "missing"
 parameter to whatever you want to be treated as missing. However this may cause a large amount of
 memory use if your dataset is very sparse. For example:
-
+
 .. code-block:: scala
 
   val assembler = new VectorAssembler().setInputCols(feature_names.toArray).setOutputCol("features").setHandleInvalid("keep")
   // conversion to dense vector using Array()
-
+
   val featurePipeline = new Pipeline().setStages(Array(assembler))
   val featureModel = featurePipeline.fit(df_training)
   val featureDf = featureModel.transform(df_training)
-
+
   val xgbParam = Map("eta" -> 0.1f,
       "max_depth" -> 2,
       "objective" -> "multi:softprob",
@@ -181,10 +181,10 @@ Example of setting a missing value (e.g. -999) to the "missing" parameter in XGB
       "num_workers" -> 2,
       "allow_non_zero_for_missing" -> "true",
       "missing" -> -999)
-
+
   val xgb = new XGBoostClassifier(xgbParam)
   val xgbclassifier = xgb.fit(featureDf)
-
+
 2. Before calling VectorAssembler you can transform the values you want to represent missing
    into an irregular value that is not 0, NaN, or Null and set the "missing" parameter to 0.
    The irregular value should ideally be chosen to be
@@ -586,11 +586,3 @@ An equivalent way is to pass in parameters in XGBoostClassifier's constructor:
     setLabelCol("classIndex")
 
 If the training failed during these 100 rounds, the next run of training would start by reading the latest checkpoint file in ``/checkpoints_path`` and start from the iteration when the checkpoint was built until to next failure or the specified 100 rounds.
-
-
-Developer Notes
-===============
-
-There's an environment variable called ``XGBOOST_RABIT_TRACKER_IP_FOR_TEST`` used to
-specify the tracker IP, which can be used in combination with ``SPARK_LOCAL_IP``. It's
-only used for testing and is not maintained as a part of the interface.
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/TrackerProperties.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/TrackerProperties.java
index 9df51feb7..45a6b1e06 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/TrackerProperties.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/TrackerProperties.java
@@ -51,8 +51,6 @@ public class TrackerProperties {
   }
 
   public String getHostIp(){
-    // mostly for testing
-    String hostIp = System.getenv("XGBOOST_RABIT_TRACKER_IP_FOR_TEST");
-    return hostIp != null ? hostIp : this.properties.getProperty(HOST_IP);
+    return this.properties.getProperty(HOST_IP);
   }
 }
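
For clarity on what the `TrackerProperties` change does at runtime, here is a minimal, self-contained Java sketch of the post-change lookup behavior. This is not the library's actual class: the `host-ip` key name and the standalone `main` harness are illustrative assumptions (the real key lives in the private `HOST_IP` constant of `TrackerProperties`).

```java
import java.util.Properties;

// Minimal sketch (not the real TrackerProperties class): after this change,
// the tracker host IP is resolved solely from the configured properties;
// System.getenv("XGBOOST_RABIT_TRACKER_IP_FOR_TEST") is no longer consulted.
public class TrackerHostIpSketch {

  // Assumed key name for illustration only.
  private static final String HOST_IP = "host-ip";

  private final Properties properties;

  public TrackerHostIpSketch(Properties properties) {
    this.properties = properties;
  }

  public String getHostIp() {
    // Single source of truth: the properties object. Returns null when the
    // key is unset, per Properties.getProperty's contract.
    return this.properties.getProperty(HOST_IP);
  }

  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty(HOST_IP, "192.168.1.10");

    TrackerHostIpSketch tracker = new TrackerHostIpSketch(props);
    // Prints 192.168.1.10 regardless of any environment variables,
    // which is the post-change behavior this sketch demonstrates.
    System.out.println(tracker.getHostIp());
  }
}
```

This is also why the CI env entries (`SPARK_LOCAL_IP`, `XGBOOST_RABIT_TRACKER_IP_FOR_TEST`) and the tutorial's Developer Notes section are deleted in the same diff: with the environment-variable escape hatch gone, tests that need a fixed tracker address must supply it through the properties mechanism instead.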