Fix C++11 config parser (#4521)
* Fix C++11 config parser * Use raw strings to improve readability of regex * Fix compilation for GCC 5.x Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
This commit is contained in:
parent
23a10c8339
commit
3f2fe25a32
@ -341,11 +341,10 @@ int CLIRunTask(int argc, char *argv[]) {
|
||||
}
|
||||
rabit::Init(argc, argv);
|
||||
|
||||
common::ConfigParse cp(argv[1]);
|
||||
common::ConfigParser cp(argv[1]);
|
||||
auto cfg = cp.Parse();
|
||||
cfg.emplace_back("seed", "0");
|
||||
|
||||
|
||||
for (int i = 2; i < argc; ++i) {
|
||||
char name[256], val[256];
|
||||
if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) {
|
||||
|
||||
@ -2,17 +2,20 @@
|
||||
* Copyright 2014-2019 by Contributors
|
||||
* \file config.h
|
||||
* \brief helper class to load in configures from file
|
||||
* \author Haoda Fu
|
||||
* \author Haoda Fu, Hyunsu Cho
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_CONFIG_H_
|
||||
#define XGBOOST_COMMON_CONFIG_H_
|
||||
|
||||
#include <xgboost/logging.h>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <istream>
|
||||
#include <fstream>
|
||||
#include <istream>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <regex>
|
||||
#include <iterator>
|
||||
#include <utility>
|
||||
|
||||
namespace xgboost {
|
||||
@ -20,66 +23,147 @@ namespace common {
|
||||
/*!
|
||||
* \brief Implementation of config reader
|
||||
*/
|
||||
class ConfigParse {
|
||||
class ConfigParser {
|
||||
public:
|
||||
/*!
|
||||
* \brief constructor
|
||||
* \param cfgFileName name of configure file
|
||||
* \brief Constructor for INI-style configuration parser
|
||||
* \param path path to configuration file
|
||||
*/
|
||||
explicit ConfigParse(const std::string &cfgFileName) {
|
||||
fi_.open(cfgFileName);
|
||||
if (fi_.fail()) {
|
||||
LOG(FATAL) << "cannot open file " << cfgFileName;
|
||||
}
|
||||
explicit ConfigParser(const std::string& path)
|
||||
: line_comment_regex_("^#"),
|
||||
key_regex_(R"rx(^([^#"'=\r\n\t ]+)[\t ]*=)rx"),
|
||||
key_regex_escaped_(R"rx(^(["'])([^"'=\r\n]+)\1[\t ]*=)rx"),
|
||||
value_regex_(R"rx(^([^#"'=\r\n\t ]+)[\t ]*(?:#.*){0,1}$)rx"),
|
||||
value_regex_escaped_(R"rx(^(["'])([^"'=\r\n]+)\1[\t ]*(?:#.*){0,1}$)rx"),
|
||||
path_(path) {}
|
||||
|
||||
std::string LoadConfigFile(const std::string& path) {
|
||||
std::ifstream fin(path, std::ios_base::in | std::ios_base::binary);
|
||||
CHECK(fin) << "Failed to open: " << path;
|
||||
std::string content{std::istreambuf_iterator<char>(fin),
|
||||
std::istreambuf_iterator<char>()};
|
||||
return content;
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief parse the configure file
|
||||
* \brief Normalize end-of-line in a file so that it uses LF for all
|
||||
* line endings.
|
||||
*
|
||||
* This is needed because some OSes use CR or CR LF instead. So we
|
||||
* replace all CR with LF.
|
||||
*
|
||||
* \param p_config_str pointer to configuration
|
||||
*/
|
||||
std::vector<std::pair<std::string, std::string> > Parse() {
|
||||
std::vector<std::pair<std::string, std::string> > results{};
|
||||
std::string NormalizeConfigEOL(std::string const& config_str) {
|
||||
std::string result;
|
||||
std::stringstream ss(config_str);
|
||||
for (size_t i = 0; i < config_str.size(); ++i) {
|
||||
if (config_str[i] == '\r') {
|
||||
result.push_back('\n');
|
||||
continue;
|
||||
}
|
||||
result.push_back(config_str[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Parse configuration file into key-value pairs.
|
||||
* \param path path to configuration file
|
||||
* \return list of key-value pairs
|
||||
*/
|
||||
std::vector<std::pair<std::string, std::string>> Parse() {
|
||||
std::string content { LoadConfigFile(path_) };
|
||||
content = NormalizeConfigEOL(content);
|
||||
std::stringstream ss { content };
|
||||
std::vector<std::pair<std::string, std::string>> results;
|
||||
char delimiter = '=';
|
||||
char comment = '#';
|
||||
std::string line{};
|
||||
std::string name{};
|
||||
std::string value{};
|
||||
|
||||
while (!fi_.eof()) {
|
||||
std::getline(fi_, line); // read a line of configure file
|
||||
line = line.substr(0, line.find(comment)); // anything beyond # is comment
|
||||
size_t delimiterPos = line.find(delimiter); // find the = sign
|
||||
name = line.substr(0, delimiterPos); // anything before = is the name
|
||||
// after this = is the value
|
||||
value = line.substr(delimiterPos + 1, line.length() - delimiterPos - 1);
|
||||
|
||||
if (line.empty() || name.empty() || value.empty())
|
||||
continue; // skip a line if # at beginning or there is no value or no name.
|
||||
CleanString(&name); // clean the string
|
||||
CleanString(&value);
|
||||
results.emplace_back(name, value);
|
||||
std::string line;
|
||||
std::string key, value;
|
||||
// Loop over every line of the configuration file
|
||||
while (std::getline(ss, line)) {
|
||||
if (ParseKeyValuePair(line, &key, &value)) {
|
||||
results.emplace_back(key, value);
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
~ConfigParse() {
|
||||
fi_.close();
|
||||
}
|
||||
|
||||
private:
|
||||
std::ifstream fi_;
|
||||
std::string allowableChar_ =
|
||||
"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-./\\";
|
||||
std::string path_;
|
||||
const std::regex line_comment_regex_, key_regex_, key_regex_escaped_,
|
||||
value_regex_, value_regex_escaped_;
|
||||
|
||||
public:
|
||||
/*!
|
||||
* \brief Remove leading and trailing whitespaces from a given string
|
||||
* \param str string
|
||||
* \return Copy of str with leading and trailing whitespaces removed
|
||||
*/
|
||||
static std::string TrimWhitespace(const std::string& str) {
|
||||
const auto first_char = str.find_first_not_of(" \t\n\r");
|
||||
const auto last_char = str.find_last_not_of(" \t\n\r");
|
||||
if (first_char == std::string::npos) {
|
||||
// Every character in str is a whitespace
|
||||
return std::string();
|
||||
}
|
||||
CHECK_NE(last_char, std::string::npos);
|
||||
const auto substr_len = last_char + 1 - first_char;
|
||||
return str.substr(first_char, substr_len);
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief remove unnecessary chars.
|
||||
* \brief Parse a key-value pair from a string representing a line
|
||||
* \param str string (cannot be multi-line)
|
||||
* \param key place to store the key, if parsing is successful
|
||||
* \param value place to store the value, if parsing is successful
|
||||
* \return Whether the parsing was successful
|
||||
*/
|
||||
void CleanString(std::string * str) {
|
||||
size_t firstIndx = str->find_first_of(allowableChar_);
|
||||
size_t lastIndx = str->find_last_of(allowableChar_);
|
||||
// this line can be more efficient, but keep as is for simplicity.
|
||||
*str = str->substr(firstIndx, lastIndx - firstIndx + 1);
|
||||
bool ParseKeyValuePair(const std::string& str, std::string* key,
|
||||
std::string* value) {
|
||||
std::string buf = TrimWhitespace(str);
|
||||
if (buf.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Match key */
|
||||
std::smatch m;
|
||||
if (std::regex_search(buf, m, line_comment_regex_)) {
|
||||
// This line is a comment
|
||||
return false;
|
||||
} else if (std::regex_search(buf, m, key_regex_)) {
|
||||
// Key doesn't have whitespace or #
|
||||
CHECK_EQ(m.size(), 2);
|
||||
*key = m[1].str();
|
||||
} else if (std::regex_search(buf, m, key_regex_escaped_)) {
|
||||
// Key has a whitespace and/or #; it has to be wrapped around a pair of
|
||||
// single or double quotes. Example: "foo bar" 'foo#bar'
|
||||
CHECK_EQ(m.size(), 3);
|
||||
*key = m[2].str();
|
||||
} else {
|
||||
LOG(FATAL) << "This line is not a valid key-value pair: " << str;
|
||||
}
|
||||
|
||||
/* Match value */
|
||||
buf = m.suffix().str();
|
||||
buf = TrimWhitespace(buf);
|
||||
if (std::regex_search(buf, m, value_regex_)) {
|
||||
// Value doesn't have whitespace or #
|
||||
CHECK_EQ(m.size(), 2);
|
||||
*value = m[1].str();
|
||||
} else if (std::regex_search(buf, m, value_regex_escaped_)) {
|
||||
// Value has a whitespace and/or #; it has to be wrapped around a pair of
|
||||
// single or double quotes. Example: "foo bar" 'foo#bar'
|
||||
CHECK_EQ(m.size(), 3);
|
||||
*value = m[2].str();
|
||||
} else {
|
||||
LOG(FATAL) << "This line is not a valid key-value pair: " << str;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_CONFIG_H_
|
||||
|
||||
169
tests/cpp/common/test_config.cc
Normal file
169
tests/cpp/common/test_config.cc
Normal file
@ -0,0 +1,169 @@
|
||||
/*!
|
||||
* Copyright 2019 by Contributors
|
||||
*/
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <gtest/gtest.h>
|
||||
#include <dmlc/filesystem.h>
|
||||
#include "../../../src/common/config.h"
|
||||
#include "../helpers.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
TEST(ConfigParser, NormalizeConfigEOL) {
|
||||
// Test whether strings with NL are loaded correctly.
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/my.conf";
|
||||
/* Old Mac OS uses \r for line ending */
|
||||
{
|
||||
std::string const input = "foo\rbar\rdog\r";
|
||||
std::string const output = "foo\nbar\ndog\n";
|
||||
{
|
||||
std::ofstream fp(
|
||||
tmp_file,
|
||||
std::ios_base::out | std::ios_base::trunc | std::ios_base::binary);
|
||||
fp << input;
|
||||
}
|
||||
{
|
||||
ConfigParser parser(tmp_file);
|
||||
auto content = parser.LoadConfigFile(tmp_file);
|
||||
content = parser.NormalizeConfigEOL(content);
|
||||
ASSERT_EQ(content, output);
|
||||
}
|
||||
}
|
||||
/* Windows uses \r\n for line ending */
|
||||
{
|
||||
std::string const input = "foo\r\nbar\r\ndog\r\n";
|
||||
std::string const output = "foo\n\nbar\n\ndog\n\n";
|
||||
{
|
||||
std::ofstream fp(tmp_file,
|
||||
std::ios_base::out | std::ios_base::trunc | std::ios_base::binary);
|
||||
fp << input;
|
||||
}
|
||||
{
|
||||
ConfigParser parser(tmp_file);
|
||||
auto content = parser.LoadConfigFile(tmp_file);
|
||||
content = parser.NormalizeConfigEOL(content);
|
||||
ASSERT_EQ(content, output);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(ConfigParser, TrimWhitespace) {
|
||||
ASSERT_EQ(ConfigParser::TrimWhitespace("foo bar"), "foo bar");
|
||||
ASSERT_EQ(ConfigParser::TrimWhitespace(" foo bar"), "foo bar");
|
||||
ASSERT_EQ(ConfigParser::TrimWhitespace("foo bar "), "foo bar");
|
||||
ASSERT_EQ(ConfigParser::TrimWhitespace("foo bar\t"), "foo bar");
|
||||
ASSERT_EQ(ConfigParser::TrimWhitespace(" foo bar "), "foo bar");
|
||||
ASSERT_EQ(ConfigParser::TrimWhitespace("\t\t foo bar \t"), "foo bar");
|
||||
ASSERT_EQ(ConfigParser::TrimWhitespace("\tabc\t"), "abc");
|
||||
ASSERT_EQ(ConfigParser::TrimWhitespace("\r abc\t"), "abc");
|
||||
}
|
||||
|
||||
TEST(ConfigParser, ParseKeyValuePair) {
|
||||
// Create dummy configuration file
|
||||
dmlc::TemporaryDirectory tempdir;
|
||||
const std::string tmp_file = tempdir.path + "/my.conf";
|
||||
{
|
||||
std::ofstream fp(tmp_file);
|
||||
fp << "";
|
||||
}
|
||||
|
||||
ConfigParser parser(tmp_file);
|
||||
|
||||
std::string key, value;
|
||||
// 1. Empty lines or comments
|
||||
ASSERT_FALSE(parser.ParseKeyValuePair("# Mary had a little lamb",
|
||||
&key, &value));
|
||||
ASSERT_FALSE(parser.ParseKeyValuePair("#tree_method = gpu_hist",
|
||||
&key, &value));
|
||||
ASSERT_FALSE(parser.ParseKeyValuePair(
|
||||
"# minimum sum of instance weight(hessian) needed in a child",
|
||||
&key, &value));
|
||||
ASSERT_FALSE(parser.ParseKeyValuePair("", &key, &value));
|
||||
|
||||
// 2. Key-value pairs
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair("booster = gbtree", &key, &value));
|
||||
ASSERT_EQ(key, "booster");
|
||||
ASSERT_EQ(value, "gbtree");
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair("n_gpus = 2", &key, &value));
|
||||
ASSERT_EQ(key, "n_gpus");
|
||||
ASSERT_EQ(value, "2");
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair("monotone_constraints = (1,0,-1)",
|
||||
&key, &value));
|
||||
ASSERT_EQ(key, "monotone_constraints");
|
||||
ASSERT_EQ(value, "(1,0,-1)");
|
||||
// whitespace should not matter
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair(" objective=binary:logistic",
|
||||
&key, &value));
|
||||
ASSERT_EQ(key, "objective");
|
||||
ASSERT_EQ(value, "binary:logistic");
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair("tree_method\t=\thist ", &key, &value));
|
||||
ASSERT_EQ(key, "tree_method");
|
||||
ASSERT_EQ(value, "hist");
|
||||
|
||||
// 3. Use of forward and backward slashes in value
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair("test:data = test/data.libsvm",
|
||||
&key, &value));
|
||||
ASSERT_EQ(key, "test:data");
|
||||
ASSERT_EQ(value, "test/data.libsvm");
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair("data = C:\\data.libsvm", &key, &value));
|
||||
ASSERT_EQ(key, "data");
|
||||
ASSERT_EQ(value, "C:\\data.libsvm");
|
||||
|
||||
// 4. One-line comment
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair("learning_rate = 0.3 # small step",
|
||||
&key, &value));
|
||||
ASSERT_EQ(key, "learning_rate");
|
||||
ASSERT_EQ(value, "0.3");
|
||||
// Note: '#' in path won't be accepted correctly unless the whole path is
|
||||
// wrapped with quotes. This is important for external memory.
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair("data = dmatrix.libsvm#dtrain.cache",
|
||||
&key, &value));
|
||||
ASSERT_EQ(key, "data");
|
||||
ASSERT_EQ(value, "dmatrix.libsvm"); // cache was silently ignored
|
||||
|
||||
// 5. Wrapping key/value with quotes
|
||||
// Any key or value containing '#' needs to be wrapped with quotes
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair("data = \"dmatrix.libsvm#dtrain.cache\"",
|
||||
&key, &value));
|
||||
ASSERT_EQ(key, "data");
|
||||
ASSERT_EQ(value, "dmatrix.libsvm#dtrain.cache"); // cache is now kept
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair(
|
||||
"data = \"C:\\Administrator\\train_file.txt#trainbincache\"",
|
||||
&key, &value));
|
||||
ASSERT_EQ(key, "data");
|
||||
ASSERT_EQ(value, "C:\\Administrator\\train_file.txt#trainbincache");
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair("\'month#day\' = \"November#2019\"",
|
||||
&key, &value));
|
||||
ASSERT_EQ(key, "month#day");
|
||||
ASSERT_EQ(value, "November#2019");
|
||||
// Likewise, key or value containing a space needs to be quoted
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair("\"my data\" = \' so precious! \'",
|
||||
&key, &value));
|
||||
ASSERT_EQ(key, "my data");
|
||||
ASSERT_EQ(value, " so precious! ");
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair("interaction_constraints = "
|
||||
"\"[[0, 2], [1, 3, 4], [5, 6]]\"",
|
||||
&key, &value));
|
||||
ASSERT_EQ(key, "interaction_constraints");
|
||||
ASSERT_EQ(value, "[[0, 2], [1, 3, 4], [5, 6]]");
|
||||
|
||||
// 6. Unicode
|
||||
ASSERT_TRUE(parser.ParseKeyValuePair("클래스상속 = 类继承", &key, &value));
|
||||
ASSERT_EQ(key, "클래스상속");
|
||||
ASSERT_EQ(value, "类继承");
|
||||
|
||||
// 7. Ill-formed data should throw exception
|
||||
for (const char* str : {"data = C:\\My Documents\\cat.csv", "cow=",
|
||||
"C# = 100%", "= woof ",
|
||||
"interaction_constraints = [[0, 2], [1]]",
|
||||
"data = \"train.txt#cache",
|
||||
"data = \'train.txt#cache", "foo = \'bar\""}) {
|
||||
ASSERT_THROW(parser.ParseKeyValuePair(str, &key, &value), dmlc::Error);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
Loading…
x
Reference in New Issue
Block a user