From 3f2fe25a32f8ca3b1f160488e3a152c2c8649a0f Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Mon, 3 Jun 2019 07:18:16 -0700 Subject: [PATCH] Fix C++11 config parser (#4521) * Fix C++11 config parser * Use raw strings to improve readability of regex * Fix compilation for GCC 5.x Co-authored-by: Jiaming Yuan --- src/cli_main.cc | 3 +- src/common/config.h | 172 ++++++++++++++++++++++++-------- tests/cpp/common/test_config.cc | 169 +++++++++++++++++++++++++++++++ 3 files changed, 298 insertions(+), 46 deletions(-) create mode 100644 tests/cpp/common/test_config.cc diff --git a/src/cli_main.cc b/src/cli_main.cc index ef82ff86f..2901eeec7 100644 --- a/src/cli_main.cc +++ b/src/cli_main.cc @@ -341,11 +341,10 @@ int CLIRunTask(int argc, char *argv[]) { } rabit::Init(argc, argv); - common::ConfigParse cp(argv[1]); + common::ConfigParser cp(argv[1]); auto cfg = cp.Parse(); cfg.emplace_back("seed", "0"); - for (int i = 2; i < argc; ++i) { char name[256], val[256]; if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) { diff --git a/src/common/config.h b/src/common/config.h index dfbccd1c5..04fcbbdb5 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -2,17 +2,20 @@ * Copyright 2014-2019 by Contributors * \file config.h * \brief helper class to load in configures from file - * \author Haoda Fu + * \author Haoda Fu, Hyunsu Cho */ #ifndef XGBOOST_COMMON_CONFIG_H_ #define XGBOOST_COMMON_CONFIG_H_ +#include #include -#include #include -#include #include +#include +#include #include +#include +#include #include namespace xgboost { @@ -20,66 +23,147 @@ namespace common { /*! * \brief Implementation of config reader */ -class ConfigParse { +class ConfigParser { public: /*! - * \brief constructor - * \param cfgFileName name of configure file + * \brief Constructor for INI-style configuration parser + * \param path path to configuration file */ - explicit ConfigParse(const std::string &cfgFileName) { - fi_.open(cfgFileName); - if (fi_.fail()) { - LOG(FATAL) << "cannot open file " << cfgFileName; - } + explicit ConfigParser(const std::string& path) + : line_comment_regex_("^#"), + key_regex_(R"rx(^([^#"'=\r\n\t ]+)[\t ]*=)rx"), + key_regex_escaped_(R"rx(^(["'])([^"'=\r\n]+)\1[\t ]*=)rx"), + value_regex_(R"rx(^([^#"'=\r\n\t ]+)[\t ]*(?:#.*){0,1}$)rx"), + value_regex_escaped_(R"rx(^(["'])([^"'=\r\n]+)\1[\t ]*(?:#.*){0,1}$)rx"), + path_(path) {} + + std::string LoadConfigFile(const std::string& path) { + std::ifstream fin(path, std::ios_base::in | std::ios_base::binary); + CHECK(fin) << "Failed to open: " << path; + std::string content{std::istreambuf_iterator(fin), + std::istreambuf_iterator()}; + return content; } /*! - * \brief parse the configure file + * \brief Normalize end-of-line in a file so that it uses LF for all + * line endings. + * + * This is needed because some OSes use CR or CR LF instead. So we + * replace all CR with LF. + * + * \param p_config_str pointer to configuration */ - std::vector > Parse() { - std::vector > results{}; + std::string NormalizeConfigEOL(std::string const& config_str) { + std::string result; + std::stringstream ss(config_str); + for (size_t i = 0; i < config_str.size(); ++i) { + if (config_str[i] == '\r') { + result.push_back('\n'); + continue; + } + result.push_back(config_str[i]); + } + return result; + } + + /*! + * \brief Parse configuration file into key-value pairs. + * \param path path to configuration file + * \return list of key-value pairs + */ + std::vector> Parse() { + std::string content { LoadConfigFile(path_) }; + content = NormalizeConfigEOL(content); + std::stringstream ss { content }; + std::vector> results; char delimiter = '='; char comment = '#'; - std::string line{}; - std::string name{}; - std::string value{}; - - while (!fi_.eof()) { - std::getline(fi_, line); // read a line of configure file - line = line.substr(0, line.find(comment)); // anything beyond # is comment - size_t delimiterPos = line.find(delimiter); // find the = sign - name = line.substr(0, delimiterPos); // anything before = is the name - // after this = is the value - value = line.substr(delimiterPos + 1, line.length() - delimiterPos - 1); - - if (line.empty() || name.empty() || value.empty()) - continue; // skip a line if # at beginning or there is no value or no name. - CleanString(&name); // clean the string - CleanString(&value); - results.emplace_back(name, value); + std::string line; + std::string key, value; + // Loop over every line of the configuration file + while (std::getline(ss, line)) { + if (ParseKeyValuePair(line, &key, &value)) { + results.emplace_back(key, value); + } } return results; } - ~ConfigParse() { - fi_.close(); - } - private: - std::ifstream fi_; - std::string allowableChar_ = - "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-./\\"; + std::string path_; + const std::regex line_comment_regex_, key_regex_, key_regex_escaped_, + value_regex_, value_regex_escaped_; + + public: + /*! + * \brief Remove leading and trailing whitespaces from a given string + * \param str string + * \return Copy of str with leading and trailing whitespaces removed + */ + static std::string TrimWhitespace(const std::string& str) { + const auto first_char = str.find_first_not_of(" \t\n\r"); + const auto last_char = str.find_last_not_of(" \t\n\r"); + if (first_char == std::string::npos) { + // Every character in str is a whitespace + return std::string(); + } + CHECK_NE(last_char, std::string::npos); + const auto substr_len = last_char + 1 - first_char; + return str.substr(first_char, substr_len); + } /*! - * \brief remove unnecessary chars. + * \brief Parse a key-value pair from a string representing a line + * \param str string (cannot be multi-line) + * \param key place to store the key, if parsing is successful + * \param value place to store the value, if parsing is successful + * \return Whether the parsing was successful */ - void CleanString(std::string * str) { - size_t firstIndx = str->find_first_of(allowableChar_); - size_t lastIndx = str->find_last_of(allowableChar_); - // this line can be more efficient, but keep as is for simplicity. - *str = str->substr(firstIndx, lastIndx - firstIndx + 1); + bool ParseKeyValuePair(const std::string& str, std::string* key, + std::string* value) { + std::string buf = TrimWhitespace(str); + if (buf.empty()) { + return false; + } + + /* Match key */ + std::smatch m; + if (std::regex_search(buf, m, line_comment_regex_)) { + // This line is a comment + return false; + } else if (std::regex_search(buf, m, key_regex_)) { + // Key doesn't have whitespace or # + CHECK_EQ(m.size(), 2); + *key = m[1].str(); + } else if (std::regex_search(buf, m, key_regex_escaped_)) { + // Key has a whitespace and/or #; it has to be wrapped around a pair of + // single or double quotes. Example: "foo bar" 'foo#bar' + CHECK_EQ(m.size(), 3); + *key = m[2].str(); + } else { + LOG(FATAL) << "This line is not a valid key-value pair: " << str; + } + + /* Match value */ + buf = m.suffix().str(); + buf = TrimWhitespace(buf); + if (std::regex_search(buf, m, value_regex_)) { + // Value doesn't have whitespace or # + CHECK_EQ(m.size(), 2); + *value = m[1].str(); + } else if (std::regex_search(buf, m, value_regex_escaped_)) { + // Value has a whitespace and/or #; it has to be wrapped around a pair of + // single or double quotes. Example: "foo bar" 'foo#bar' + CHECK_EQ(m.size(), 3); + *value = m[2].str(); + } else { + LOG(FATAL) << "This line is not a valid key-value pair: " << str; + } + return true; } }; + } // namespace common } // namespace xgboost #endif // XGBOOST_COMMON_CONFIG_H_ diff --git a/tests/cpp/common/test_config.cc b/tests/cpp/common/test_config.cc new file mode 100644 index 000000000..542bf2c30 --- /dev/null +++ b/tests/cpp/common/test_config.cc @@ -0,0 +1,169 @@ +/*! + * Copyright 2019 by Contributors + */ +#include +#include +#include +#include +#include "../../../src/common/config.h" +#include "../helpers.h" + +namespace xgboost { +namespace common { + +TEST(ConfigParser, NormalizeConfigEOL) { + // Test whether strings with NL are loaded correctly. + dmlc::TemporaryDirectory tempdir; + const std::string tmp_file = tempdir.path + "/my.conf"; + /* Old Mac OS uses \r for line ending */ + { + std::string const input = "foo\rbar\rdog\r"; + std::string const output = "foo\nbar\ndog\n"; + { + std::ofstream fp( + tmp_file, + std::ios_base::out | std::ios_base::trunc | std::ios_base::binary); + fp << input; + } + { + ConfigParser parser(tmp_file); + auto content = parser.LoadConfigFile(tmp_file); + content = parser.NormalizeConfigEOL(content); + ASSERT_EQ(content, output); + } + } + /* Windows uses \r\n for line ending */ + { + std::string const input = "foo\r\nbar\r\ndog\r\n"; + std::string const output = "foo\n\nbar\n\ndog\n\n"; + { + std::ofstream fp(tmp_file, + std::ios_base::out | std::ios_base::trunc | std::ios_base::binary); + fp << input; + } + { + ConfigParser parser(tmp_file); + auto content = parser.LoadConfigFile(tmp_file); + content = parser.NormalizeConfigEOL(content); + ASSERT_EQ(content, output); + } + } +} + +TEST(ConfigParser, TrimWhitespace) { + ASSERT_EQ(ConfigParser::TrimWhitespace("foo bar"), "foo bar"); + ASSERT_EQ(ConfigParser::TrimWhitespace(" foo bar"), "foo bar"); + ASSERT_EQ(ConfigParser::TrimWhitespace("foo bar "), "foo bar"); + ASSERT_EQ(ConfigParser::TrimWhitespace("foo bar\t"), "foo bar"); + ASSERT_EQ(ConfigParser::TrimWhitespace(" foo bar "), "foo bar"); + ASSERT_EQ(ConfigParser::TrimWhitespace("\t\t foo bar \t"), "foo bar"); + ASSERT_EQ(ConfigParser::TrimWhitespace("\tabc\t"), "abc"); + ASSERT_EQ(ConfigParser::TrimWhitespace("\r abc\t"), "abc"); +} + +TEST(ConfigParser, ParseKeyValuePair) { + // Create dummy configuration file + dmlc::TemporaryDirectory tempdir; + const std::string tmp_file = tempdir.path + "/my.conf"; + { + std::ofstream fp(tmp_file); + fp << ""; + } + + ConfigParser parser(tmp_file); + + std::string key, value; + // 1. Empty lines or comments + ASSERT_FALSE(parser.ParseKeyValuePair("# Mary had a little lamb", + &key, &value)); + ASSERT_FALSE(parser.ParseKeyValuePair("#tree_method = gpu_hist", + &key, &value)); + ASSERT_FALSE(parser.ParseKeyValuePair( + "# minimum sum of instance weight(hessian) needed in a child", + &key, &value)); + ASSERT_FALSE(parser.ParseKeyValuePair("", &key, &value)); + + // 2. Key-value pairs + ASSERT_TRUE(parser.ParseKeyValuePair("booster = gbtree", &key, &value)); + ASSERT_EQ(key, "booster"); + ASSERT_EQ(value, "gbtree"); + ASSERT_TRUE(parser.ParseKeyValuePair("n_gpus = 2", &key, &value)); + ASSERT_EQ(key, "n_gpus"); + ASSERT_EQ(value, "2"); + ASSERT_TRUE(parser.ParseKeyValuePair("monotone_constraints = (1,0,-1)", + &key, &value)); + ASSERT_EQ(key, "monotone_constraints"); + ASSERT_EQ(value, "(1,0,-1)"); + // whitespace should not matter + ASSERT_TRUE(parser.ParseKeyValuePair(" objective=binary:logistic", + &key, &value)); + ASSERT_EQ(key, "objective"); + ASSERT_EQ(value, "binary:logistic"); + ASSERT_TRUE(parser.ParseKeyValuePair("tree_method\t=\thist ", &key, &value)); + ASSERT_EQ(key, "tree_method"); + ASSERT_EQ(value, "hist"); + + // 3. Use of forward and backward slashes in value + ASSERT_TRUE(parser.ParseKeyValuePair("test:data = test/data.libsvm", + &key, &value)); + ASSERT_EQ(key, "test:data"); + ASSERT_EQ(value, "test/data.libsvm"); + ASSERT_TRUE(parser.ParseKeyValuePair("data = C:\\data.libsvm", &key, &value)); + ASSERT_EQ(key, "data"); + ASSERT_EQ(value, "C:\\data.libsvm"); + + // 4. One-line comment + ASSERT_TRUE(parser.ParseKeyValuePair("learning_rate = 0.3 # small step", + &key, &value)); + ASSERT_EQ(key, "learning_rate"); + ASSERT_EQ(value, "0.3"); + // Note: '#' in path won't be accepted correctly unless the whole path is + // wrapped with quotes. This is important for external memory. + ASSERT_TRUE(parser.ParseKeyValuePair("data = dmatrix.libsvm#dtrain.cache", + &key, &value)); + ASSERT_EQ(key, "data"); + ASSERT_EQ(value, "dmatrix.libsvm"); // cache was silently ignored + + // 5. Wrapping key/value with quotes + // Any key or value containing '#' needs to be wrapped with quotes + ASSERT_TRUE(parser.ParseKeyValuePair("data = \"dmatrix.libsvm#dtrain.cache\"", + &key, &value)); + ASSERT_EQ(key, "data"); + ASSERT_EQ(value, "dmatrix.libsvm#dtrain.cache"); // cache is now kept + ASSERT_TRUE(parser.ParseKeyValuePair( + "data = \"C:\\Administrator\\train_file.txt#trainbincache\"", + &key, &value)); + ASSERT_EQ(key, "data"); + ASSERT_EQ(value, "C:\\Administrator\\train_file.txt#trainbincache"); + ASSERT_TRUE(parser.ParseKeyValuePair("\'month#day\' = \"November#2019\"", + &key, &value)); + ASSERT_EQ(key, "month#day"); + ASSERT_EQ(value, "November#2019"); + // Likewise, key or value containing a space needs to be quoted + ASSERT_TRUE(parser.ParseKeyValuePair("\"my data\" = \' so precious! \'", + &key, &value)); + ASSERT_EQ(key, "my data"); + ASSERT_EQ(value, " so precious! "); + ASSERT_TRUE(parser.ParseKeyValuePair("interaction_constraints = " + "\"[[0, 2], [1, 3, 4], [5, 6]]\"", + &key, &value)); + ASSERT_EQ(key, "interaction_constraints"); + ASSERT_EQ(value, "[[0, 2], [1, 3, 4], [5, 6]]"); + + // 6. Unicode + ASSERT_TRUE(parser.ParseKeyValuePair("클래스상속 = 类继承", &key, &value)); + ASSERT_EQ(key, "클래스상속"); + ASSERT_EQ(value, "类继承"); + + // 7. Ill-formed data should throw exception + for (const char* str : {"data = C:\\My Documents\\cat.csv", "cow=", + "C# = 100%", "= woof ", + "interaction_constraints = [[0, 2], [1]]", + "data = \"train.txt#cache", + "data = \'train.txt#cache", "foo = \'bar\""}) { + ASSERT_THROW(parser.ParseKeyValuePair(str, &key, &value), dmlc::Error); + } +} + +} // namespace common +} // namespace xgboost