Handle UTF-8 paths correctly on Windows platform (#9443)

* Fix round-trip serialization with UTF-8 paths

* Add compiler version check

* Add comment to C API functions

* Add Python tests

* [CI] Update MacOS deployment target

* Use std::filesystem instead of dmlc::TemporaryDirectory
This commit is contained in:
Philip Hyunsu Cho 2023-08-07 23:27:25 -07:00 committed by GitHub
parent 97fd5207dd
commit 7ce090e775
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 48 additions and 18 deletions

View File

@ -14,8 +14,24 @@ endif ((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUA
message(STATUS "CMake version ${CMAKE_VERSION}")
if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
message(FATAL_ERROR "GCC version must be at least 5.0!")
# Check compiler versions
# Use recent compilers to ensure that std::filesystem is available
# Each branch below aborts configuration with FATAL_ERROR when the detected
# compiler is older than the minimum for its family. A compiler that matches
# none of these branches is not version-checked here.
if(MSVC)
# MSVC_VERSION 1920 is the first value reported by Visual Studio 2019.
if(MSVC_VERSION LESS 1920)
message(FATAL_ERROR "Need Visual Studio 2019 or newer to build XGBoost")
endif()
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
# VERSION_LESS compares component-wise, so e.g. 8.0.x is rejected and 8.1.0 is accepted.
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "8.1")
message(FATAL_ERROR "Need GCC 8.1 or newer to build XGBoost")
endif()
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
# AppleClang has its own compiler ID and its own minimum (tied to the Xcode
# release), so it is tested separately from upstream Clang below.
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "11.0")
message(FATAL_ERROR "Need Xcode 11.0 (AppleClang 11.0) or newer to build XGBoost")
endif()
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
# Upstream (non-Apple) Clang.
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.0")
message(FATAL_ERROR "Need Clang 9.0 or newer to build XGBoost")
endif()
endif()
include(${xgboost_SOURCE_DIR}/cmake/FindPrefetchIntrinsics.cmake)

View File

@ -1221,7 +1221,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *v
* \brief Load model from existing file
*
* \param handle handle
* \param fname File URI or file name.
* \param fname File URI or file name. The string must be UTF-8 encoded.
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterLoadModel(BoosterHandle handle,
@ -1230,7 +1230,7 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle,
* \brief Save model into existing file
*
* \param handle handle
* \param fname File URI or file name.
* \param fname File URI or file name. The string must be UTF-8 encoded.
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterSaveModel(BoosterHandle handle,

View File

@ -28,6 +28,7 @@
#include <cstddef> // for size_t
#include <cstdint> // for int32_t, uint32_t
#include <cstring> // for memcpy
#include <filesystem> // for filesystem
#include <fstream> // for ifstream
#include <iterator> // for distance
#include <limits> // for numeric_limits
@ -153,7 +154,7 @@ std::string LoadSequentialFile(std::string uri, bool stream) {
// Open in binary mode so that correct file size can be computed with
// seekg(). This accommodates Windows platform:
// https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg
std::ifstream ifs(uri, std::ios_base::binary | std::ios_base::in);
std::ifstream ifs(std::filesystem::u8path(uri), std::ios_base::binary | std::ios_base::in);
if (!ifs) {
// https://stackoverflow.com/a/17338934
OpenErr();

View File

@ -35,7 +35,7 @@ if [[ "$platform_id" == macosx_* ]]; then
# MacOS, Intel
wheel_tag=macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64
cpython_ver=38
export MACOSX_DEPLOYMENT_TARGET=10.13
export MACOSX_DEPLOYMENT_TARGET=10.15
#OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-64/llvm-openmp-11.1.0-hda6cdc1_1.tar.bz2"
OPENMP_URL="https://xgboost-ci-jenkins-artifacts.s3.us-west-2.amazonaws.com/llvm-openmp-11.1.0-hda6cdc1_1-osx-64.tar.bz2"
else

View File

@ -10,6 +10,7 @@
#include <array> // for array
#include <cstddef> // std::size_t
#include <filesystem> // std::filesystem
#include <limits> // std::numeric_limits
#include <string> // std::string
#include <vector>
@ -162,7 +163,7 @@ TEST(CAPI, ConfigIO) {
TEST(CAPI, JsonModelIO) {
size_t constexpr kRows = 10;
size_t constexpr kCols = 10;
dmlc::TemporaryDirectory tempdir;
auto tempdir = std::filesystem::temp_directory_path();
auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
std::vector<std::shared_ptr<DMatrix>> mat {p_dmat};
@ -178,19 +179,19 @@ TEST(CAPI, JsonModelIO) {
learner->UpdateOneIter(0, p_dmat);
BoosterHandle handle = learner.get();
std::string modelfile_0 = tempdir.path + "/model_0.json";
XGBoosterSaveModel(handle, modelfile_0.c_str());
XGBoosterLoadModel(handle, modelfile_0.c_str());
auto modelfile_0 = tempdir / std::filesystem::u8path(u8"모델_0.json");
XGBoosterSaveModel(handle, modelfile_0.u8string().c_str());
XGBoosterLoadModel(handle, modelfile_0.u8string().c_str());
bst_ulong num_feature {0};
ASSERT_EQ(XGBoosterGetNumFeature(handle, &num_feature), 0);
ASSERT_EQ(num_feature, kCols);
std::string modelfile_1 = tempdir.path + "/model_1.json";
XGBoosterSaveModel(handle, modelfile_1.c_str());
auto modelfile_1 = tempdir / "model_1.json";
XGBoosterSaveModel(handle, modelfile_1.u8string().c_str());
auto model_str_0 = common::LoadSequentialFile(modelfile_0);
auto model_str_1 = common::LoadSequentialFile(modelfile_1);
auto model_str_0 = common::LoadSequentialFile(modelfile_0.u8string());
auto model_str_1 = common::LoadSequentialFile(modelfile_1.u8string());
ASSERT_EQ(model_str_0.front(), '{');
ASSERT_EQ(model_str_0, model_str_1);

View File

@ -1,5 +1,6 @@
import json
import os
import pathlib
import tempfile
from pathlib import Path
@ -167,6 +168,17 @@ class TestBasic:
with pytest.raises(xgb.core.XGBoostError):
xgb.Booster(model_file=u'不正なパス')
# Round-trip test for model files whose names contain non-ASCII characters.
# For each parametrized file name: train a small booster, save it under that
# name inside `tmpdir`, load it into a fresh Booster, and require that both
# boosters produce identical text dumps.
@pytest.mark.parametrize("path", ["모델.ubj", "がうる・ぐら.json"], ids=["path-0", "path-1"])
def test_unicode_path(self, tmpdir, path):
# Join the directory and the non-ASCII file name into a single pathlib.Path.
model_path = pathlib.Path(tmpdir) / path
dtrain, _ = tm.load_agaricus(__file__)
param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
bst = xgb.train(param, dtrain, num_boost_round=2)
# Save to, then load from, the same non-ASCII path.
bst.save_model(model_path)
bst2 = xgb.Booster(model_file=model_path)
# Equal text dumps mean the reloaded model matches the one that was saved.
assert bst.get_dump(dump_format="text") == bst2.get_dump(dump_format="text")
def test_dmatrix_numpy_init_omp(self):
rows = [1000, 11326, 15000]