enable ROCm on latest XGBoost

This commit is contained in:
Hui Liu
2023-10-23 11:07:08 -07:00
328 changed files with 8028 additions and 3642 deletions

View File

@@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG
ARG NCCL_VERSION_ARG
ARG RAPIDS_VERSION_ARG

View File

@@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG
# Install all basic requirements

View File

@@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG
ARG NCCL_VERSION_ARG

View File

@@ -148,10 +148,11 @@ then
$(aws ecr get-login --no-include-email --region ${DOCKER_CACHE_ECR_REGION} --registry-ids ${DOCKER_CACHE_ECR_ID})
# Pull pre-build container from Docker build cache,
# if one exists for the particular branch or pull request
echo "docker pull --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
if time docker pull --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
DOCKER_TAG="${BRANCH_NAME//\//-}" # Slashes are not allowed in Docker tag
echo "docker pull --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
if time docker pull --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
then
CACHE_FROM_CMD="--cache-from ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
CACHE_FROM_CMD="--cache-from ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
else
# If the build cache is empty for the particular branch or pull request,
# use the build cache associated with the master branch
@@ -185,8 +186,8 @@ if [[ -n "${DOCKER_CACHE_REPO}" ]]
then
# Push the container we just built to the Docker build cache
# that is associated with the particular branch or pull request
echo "docker tag ${DOCKER_IMG_NAME} ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
docker tag "${DOCKER_IMG_NAME}" "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
echo "docker tag ${DOCKER_IMG_NAME} ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
docker tag "${DOCKER_IMG_NAME}" "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
# Attempt to create Docker repository; it will fail if the repository already exists
echo "aws ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION}"
@@ -214,10 +215,10 @@ then
EOF
fi
echo "docker push --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
docker push --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
echo "docker push --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
docker push --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
if [[ $? != "0" ]]; then
echo "ERROR: could not update Docker cache ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}"
echo "ERROR: could not update Docker cache ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}"
exit 1
fi
fi

View File

@@ -0,0 +1,14 @@
#!/bin/bash
# Run cmakelint over all CMake build files in the repository.
# Exits non-zero if any lint error is found (set -e plus explicit exit 1).
set -e
# Collect every CMakeLists.txt in the tree plus *.cmake modules under
# ./cmake, excluding the vendored dmlc-core and gputreeshap submodules.
cmake_files=$(
find . -name CMakeLists.txt -o -path "./cmake/*.cmake" \
| grep -v dmlc-core \
| grep -v gputreeshap
)
# NOTE(review): ${cmake_files} is intentionally unquoted so the shell
# word-splits it into one argument per discovered file path.
cmakelint \
--linelength=120 \
--filter=-convention/filename,-package/stdargs,-readability/wonkycase \
${cmake_files} \
|| exit 1

166
tests/ci_build/lint_cpp.py Normal file
View File

@@ -0,0 +1,166 @@
import argparse
import os
import re
import sys
import cpplint
from cpplint import _cpplint_state
CXX_SUFFIX = set(["cc", "c", "cpp", "h", "cu", "hpp"])
def filepath_enumerate(paths):
    """Enumerate the file paths of all subfiles of the list of paths.

    A regular file is kept as-is; a directory is walked recursively and
    every contained file is added with a normalized path.
    """
    collected = []
    for entry in paths:
        if os.path.isfile(entry):
            collected.append(entry)
            continue
        # Non-file entries (directories; nonexistent paths yield nothing).
        for root, _dirs, names in os.walk(entry):
            collected.extend(
                os.path.normpath(os.path.join(root, fname)) for fname in names
            )
    return collected
def get_header_guard_dmlc(filename):
    """Get Header Guard Convention for DMLC Projects.
    For headers in include, directly use the path
    For headers in src, use project name plus path
    Examples: with project-name = dmlc
    include/dmlc/timer.h -> DMLC_TIMER_H_
    src/io/libsvm_parser.h -> DMLC_IO_LIBSVM_PARSER_H_
    """
    fileinfo = cpplint.FileInfo(filename)
    file_path_from_root = fileinfo.RepositoryName()
    # Directories whose prefix is stripped before building the guard name.
    inc_list = ["include", "api", "wrapper", "contrib"]
    # On Windows, mshadow headers are also treated as public includes.
    if os.name == "nt":
        inc_list.append("mshadow")
    if file_path_from_root.find("src/") != -1 and _HELPER.project_name is not None:
        # Header under src/: keep the path after "src" and prefix the
        # project name (idx + 3 starts at the "/" following "src").
        idx = file_path_from_root.find("src/")
        file_path_from_root = _HELPER.project_name + file_path_from_root[idx + 3 :]
    else:
        # Public header: drop everything up to and including "include/".
        idx = file_path_from_root.find("include/")
        if idx != -1:
            file_path_from_root = file_path_from_root[idx + 8 :]
        # Strip any remaining known public-include prefix (first match wins).
        for spath in inc_list:
            prefix = spath + "/"
            if file_path_from_root.startswith(prefix):
                file_path_from_root = re.sub("^" + prefix, "", file_path_from_root)
                break
    # Separators and whitespace become underscores, e.g. "io/a.h" -> IO_A_H_.
    return re.sub(r"[-./\s]", "_", file_path_from_root).upper() + "_"
class Lint:
    """Driver that runs cpplint over C++ files and aggregates error counts."""

    def __init__(self):
        self.project_name = "xgboost"
        # file path -> {error category: count}, split by header vs. source.
        self.cpp_header_map = {}
        self.cpp_src_map = {}
        # NOTE(review): python_map and pylint_cats appear unused in this file
        # (no pylint is run here) -- likely leftovers from the dmlc lint
        # script; confirm before removing.
        self.python_map = {}
        self.pylint_cats = set(["error", "warning", "convention", "refactor"])
        # setup cpp lint
        # Configures cpplint's module-level (global) state; must run before
        # any ProcessFile call.
        cpplint_args = ["--quiet", "--extensions=" + (",".join(CXX_SUFFIX)), "."]
        _ = cpplint.ParseArguments(cpplint_args)
        # NOTE(review): the trailing comma inside "-build/include," looks
        # accidental -- verify the intended filter string.
        cpplint._SetFilters(
            ",".join(
                [
                    "-build/c++11",
                    "-build/include,",
                    "+build/namespaces",
                    "+build/include_what_you_use",
                    "+build/include_order",
                ]
            )
        )
        cpplint._SetCountingStyle("toplevel")
        cpplint._line_length = 100

    def process_cpp(self, path, suffix):
        """Process a cpp file."""
        # Reset the shared cpplint state so counts are per-file.
        _cpplint_state.ResetErrorCounts()
        cpplint.ProcessFile(str(path), _cpplint_state.verbose_level)
        _cpplint_state.PrintErrorCounts()
        errors = _cpplint_state.errors_by_category.copy()
        # Headers and sources are summarized separately in print_summary.
        if suffix == "h":
            self.cpp_header_map[str(path)] = errors
        else:
            self.cpp_src_map[str(path)] = errors

    @staticmethod
    def _print_summary_map(strm, result_map, ftype):
        """Print summary of certain result map.

        Returns the number of files in *result_map* that had errors.
        """
        if len(result_map) == 0:
            return 0
        npass = sum(1 for x in result_map.values() if len(x) == 0)
        strm.write(f"====={npass}/{len(result_map)} {ftype} files passed check=====\n")
        for fname, emap in result_map.items():
            if len(emap) == 0:
                continue
            strm.write(
                f"{fname}: {sum(emap.values())} Errors of {len(emap)} Categories map={str(emap)}\n"
            )
        return len(result_map) - npass

    def print_summary(self, strm):
        """Print summary of lint.

        Writes a per-kind summary to *strm* and returns the total number of
        failing files (used by main() as the exit status).
        """
        nerr = 0
        nerr += Lint._print_summary_map(strm, self.cpp_header_map, "cpp-header")
        nerr += Lint._print_summary_map(strm, self.cpp_src_map, "cpp-source")
        if nerr == 0:
            strm.write("All passed!\n")
        else:
            strm.write(f"{nerr} files failed lint\n")
        return nerr
# Module-level singleton; constructing it configures cpplint's global state.
_HELPER = Lint()
# Replace cpplint's default header-guard naming with the DMLC convention.
cpplint.GetHeaderGuardCPPVariable = get_header_guard_dmlc
def process(fname, allow_type):
    """Process a file.

    Skips paths containing '#' (editor backup/lock files) and suffixes not
    in *allow_type*; hands C/C++/CUDA files to the lint helper.
    """
    fname = str(fname)
    suffix = fname.rsplit(".", 1)[-1]
    if "#" in fname or suffix not in allow_type:
        return
    if suffix in CXX_SUFFIX:
        _HELPER.process_cpp(fname, suffix)
def main():
    """Entry point: lint all C++ files under the given paths.

    Exits with status 1 if any linted file has errors, 0 otherwise.
    """
    parser = argparse.ArgumentParser(description="run cpp lint")
    parser.add_argument("path", nargs="+", help="path to traverse")
    parser.add_argument(
        "--exclude_path",
        nargs="+",
        default=[],
        help="exclude this path, and all subfolders if path is a folder",
    )
    args = parser.parse_args()
    # Expand excluded directories into concrete file paths for comparison.
    excluded_paths = filepath_enumerate(args.exclude_path)
    allow_type = list(CXX_SUFFIX)
    for path in args.path:
        if os.path.isfile(path):
            if os.path.normpath(path) not in excluded_paths:
                process(path, allow_type)
            continue
        for root, _dirs, files in os.walk(path):
            for name in files:
                file_path = os.path.normpath(os.path.join(root, name))
                if file_path not in excluded_paths:
                    process(file_path, allow_type)
    nerr = _HELPER.print_summary(sys.stderr)
    sys.exit(nerr > 0)


if __name__ == "__main__":
    main()

View File

@@ -28,6 +28,7 @@ my_linters <- list(
equals_na = lintr::equals_na_linter(),
fixed_regex = lintr::fixed_regex_linter(),
for_loop_index = lintr::for_loop_index_linter(),
function_left_parentheses = lintr::function_left_parentheses_linter(),
function_return = lintr::function_return_linter(),
infix_spaces_linter = lintr::infix_spaces_linter(),
is_numeric = lintr::is_numeric_linter(),

View File

@@ -3,9 +3,15 @@ import argparse
import os
import shutil
import subprocess
from io import StringIO
from pathlib import Path
from platform import system
try:
import pandas as pd
except ImportError:
pd = None
from test_utils import R_PACKAGE, ROOT, DirectoryExcursion, cd, print_time, record_time
@@ -97,16 +103,47 @@ def build_rpackage(path: str) -> str:
return tarball
def check_example_timing(rcheck_dir: Path, threshold: float) -> None:
    """Fail if any R CMD check example used too many CPU threads.

    Parses ``xgboost-Ex.timings`` from *rcheck_dir* and raises ValueError
    when any example's user/elapsed CPU ratio exceeds *threshold*.
    """
    with open(rcheck_dir / "xgboost-Ex.timings", "r") as fd:
        stripped = [line.strip() for line in fd.readlines()]
    # The timings file is tab-separated; re-join stripped lines for pandas.
    df = pd.read_csv(StringIO("\n".join(stripped)), delimiter="\t")
    ratio_n = "user/elapsed"
    df[ratio_n] = df["user"] / df["elapsed"]
    offending = df[df[ratio_n] > threshold]
    try:
        # requires the tabulate package
        df.to_markdown("timings.md")
        offending.to_markdown("offending.md")
    except ImportError:
        print("failed to export markdown files.")
    if offending.shape[0] == 0:
        return
    print(offending)
    raise ValueError("There are examples using too many threads")
@cd(ROOT)
@record_time
def check_rpackage(path: str) -> None:
env = os.environ.copy()
print("Ncpus:", f"{os.cpu_count()}")
threshold = 2.5
env.update(
{
"MAKEFLAGS": f"-j{os.cpu_count()}",
# cran specific environment variables
"_R_CHECK_EXAMPLE_TIMING_CPU_TO_ELAPSED_THRESHOLD_": str(2.5),
"_R_CHECK_EXAMPLE_TIMING_CPU_TO_ELAPSED_THRESHOLD_": str(threshold),
"_R_CHECK_TEST_TIMING_CPU_TO_ELAPSED_THRESHOLD_": str(threshold),
"_R_CHECK_VIGNETTE_TIMING_CPU_TO_ELAPSED_THRESHOLD_": str(threshold),
}
)
@@ -118,11 +155,14 @@ def check_rpackage(path: str) -> None:
CC = os.path.join(mingw_bin, "gcc.exe")
env.update({"CC": CC, "CXX": CXX})
status = subprocess.run([R, "CMD", "check", "--as-cran", path], env=env)
with open(Path("xgboost.Rcheck") / "00check.log", "r") as fd:
status = subprocess.run(
[R, "CMD", "check", "--as-cran", "--timings", path], env=env
)
rcheck_dir = Path("xgboost.Rcheck")
with open(rcheck_dir / "00check.log", "r") as fd:
check_log = fd.read()
with open(Path("xgboost.Rcheck") / "00install.out", "r") as fd:
with open(rcheck_dir / "00install.out", "r") as fd:
install_log = fd.read()
msg = f"""
@@ -144,6 +184,8 @@ def check_rpackage(path: str) -> None:
if check_log.find("Examples with CPU time") != -1:
print(msg)
raise ValueError("Suspicious NOTE.")
if pd is not None:
check_example_timing(rcheck_dir, threshold)
@cd(R_PACKAGE)
@@ -264,6 +306,8 @@ def main(args: argparse.Namespace) -> None:
test_with_autotools()
else:
test_with_cmake(args)
elif args.task == "timings":
check_example_timing(Path("xgboost.Rcheck"), 2.5)
else:
raise ValueError("Unexpected task.")
@@ -279,7 +323,7 @@ if __name__ == "__main__":
parser.add_argument(
"--task",
type=str,
choices=["pack", "build", "check", "doc"],
choices=["pack", "build", "check", "doc", "timings"],
default="check",
required=False,
)