Enable ROCm on latest XGBoost

This commit is contained in:
Hui Liu
2023-10-23 11:07:08 -07:00
328 changed files with 8028 additions and 3642 deletions

View File

@@ -3,11 +3,11 @@ find_package(xgboost REQUIRED)
# xgboost is built as static libraries, all cxx dependencies need to be linked into the
# executable.
if (XGBOOST_BUILD_STATIC_LIB)
if(XGBOOST_BUILD_STATIC_LIB)
enable_language(CXX)
# find again for those cxx libraries.
find_package(xgboost REQUIRED)
endif(XGBOOST_BUILD_STATIC_LIB)
endif()
add_executable(api-demo c-api-demo.c)
target_link_libraries(api-demo PRIVATE xgboost::xgboost)

View File

@@ -4,11 +4,11 @@ find_package(xgboost REQUIRED)
# xgboost is built as static libraries, all cxx dependencies need to be linked into the
# executable.
if (XGBOOST_BUILD_STATIC_LIB)
if(XGBOOST_BUILD_STATIC_LIB)
enable_language(CXX)
# find again for those cxx libraries.
find_package(xgboost REQUIRED)
endif(XGBOOST_BUILD_STATIC_LIB)
endif()
add_executable(inference-demo inference.c)
target_link_libraries(inference-demo PRIVATE xgboost::xgboost)

View File

@@ -104,7 +104,7 @@ def check_point_callback():
# Use callback class from xgboost.callback
# Feel free to subclass/customize it to suit your need.
check_point = xgb.callback.TrainingCheckPoint(
directory=tmpdir, iterations=rounds, name="model"
directory=tmpdir, interval=rounds, name="model"
)
xgb.train(
{"objective": "binary:logistic"},
@@ -118,7 +118,7 @@ def check_point_callback():
# This version of checkpoint saves everything including parameters and
# model. See: doc/tutorials/saving_model.rst
check_point = xgb.callback.TrainingCheckPoint(
directory=tmpdir, iterations=rounds, as_pickle=True, name="model"
directory=tmpdir, interval=rounds, as_pickle=True, name="model"
)
xgb.train(
{"objective": "binary:logistic"},

View File

@@ -24,8 +24,8 @@ param <- list("objective" = "binary:logitraw",
"nthread" = 16)
watchlist <- list("train" = xgmat)
nrounds <- 120
print ("loading data end, start to boost trees")
print("loading data end, start to boost trees")
bst <- xgb.train(param, xgmat, nrounds, watchlist)
# save out model
xgb.save(bst, "higgs.model")
print ('finish training')
print('finish training')

View File

@@ -39,11 +39,11 @@ for (i in seq_along(threads)){
"nthread" = thread)
watchlist <- list("train" = xgmat)
nrounds <- 120
print ("loading data end, start to boost trees")
print("loading data end, start to boost trees")
bst <- xgb.train(param, xgmat, nrounds, watchlist)
# save out model
xgb.save(bst, "higgs.model")
print ('finish training')
print('finish training')
})
}

View File

@@ -85,8 +85,8 @@ shutdown server
## Training with GPUs
To demo with Federated Learning using GPUs, make sure your machine has at least 2 GPUs.
Build XGBoost with the federated learning plugin enabled along with CUDA, but with NCCL
turned off (see the [README](../../plugin/federated/README.md)).
Build XGBoost with the federated learning plugin enabled along with CUDA
(see the [README](../../plugin/federated/README.md)).
Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
Modify `../config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
above.

View File

@@ -67,7 +67,7 @@ class XGBoostTrainer(Executor):
dtest = xgb.DMatrix('agaricus.txt.test?format=libsvm')
# Specify parameters via map, definition are same as c++ version
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param = {'tree_method': 'hist', 'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
if self._use_gpus:
self.log_info(fl_ctx, f'Training with GPU {rank}')
param['device'] = f"cuda:{rank}"

View File

@@ -56,4 +56,9 @@ shutdown server
## Training with GPUs
Currently GPUs are not yet supported by vertical federated XGBoost.
To demo with Vertical Federated Learning using GPUs, make sure your machine has at least 2 GPUs.
Build XGBoost with the federated learning plugin enabled along with CUDA
(see the [README](../../plugin/federated/README.md)).
Modify `../config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
above.

View File

@@ -77,13 +77,14 @@ class XGBoostTrainer(Executor):
'gamma': 1.0,
'max_depth': 8,
'min_child_weight': 100,
'tree_method': 'approx',
'tree_method': 'hist',
'grow_policy': 'depthwise',
'objective': 'binary:logistic',
'eval_metric': 'auc',
}
if self._use_gpus:
self.log_info(fl_ctx, 'GPUs are not currently supported by vertical federated XGBoost')
self.log_info(fl_ctx, f'Training with GPU {rank}')
param['device'] = f"cuda:{rank}"
# specify validations set to watch performance
watchlist = [(dtest, "eval"), (dtrain, "train")]