Fix C++11 config parser (#4521)

* Fix C++11 config parser
* Use raw strings to improve readability of regex
* Fix compilation for GCC 5.x

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
This commit is contained in:
Philip Hyunsu Cho
2019-06-03 07:18:16 -07:00
committed by Jiaming Yuan
parent 23a10c8339
commit 3f2fe25a32
3 changed files with 298 additions and 46 deletions

View File

@@ -2,17 +2,20 @@
* Copyright 2014-2019 by Contributors
* \file config.h
* \brief helper class to load in configures from file
* \author Haoda Fu
* \author Haoda Fu, Hyunsu Cho
*/
#ifndef XGBOOST_COMMON_CONFIG_H_
#define XGBOOST_COMMON_CONFIG_H_
#include <xgboost/logging.h>
#include <cstdio>
#include <cstring>
#include <string>
#include <istream>
#include <fstream>
#include <istream>
#include <sstream>
#include <vector>
#include <regex>
#include <iterator>
#include <utility>
namespace xgboost {
@@ -20,66 +23,147 @@ namespace common {
/*!
* \brief Implementation of config reader
*/
class ConfigParse {
class ConfigParser {
public:
/*!
* \brief constructor
* \param cfgFileName name of configure file
* \brief Constructor for INI-style configuration parser
* \param path path to configuration file
*/
explicit ConfigParse(const std::string &cfgFileName) {
fi_.open(cfgFileName);
if (fi_.fail()) {
LOG(FATAL) << "cannot open file " << cfgFileName;
}
explicit ConfigParser(const std::string& path)
    // Initializers are listed in member declaration order (path_ first, then
    // the regexes). Members are always initialized in declaration order, so
    // listing them any other way is misleading and triggers -Wreorder.
    : path_(path),
      // A line whose first character is '#' is a comment
      line_comment_regex_("^#"),
      // Unquoted key: a run of characters other than # " ' = CR LF TAB SPACE,
      // followed by optional blanks and '='
      key_regex_(R"rx(^([^#"'=\r\n\t ]+)[\t ]*=)rx"),
      // Quoted key: wrapped in a matching pair of single or double quotes; the
      // content may contain blanks and '#', but no quotes, '=', CR or LF
      key_regex_escaped_(R"rx(^(["'])([^"'=\r\n]+)\1[\t ]*=)rx"),
      // Unquoted value, optionally followed by blanks and a trailing # comment
      value_regex_(R"rx(^([^#"'=\r\n\t ]+)[\t ]*(?:#.*){0,1}$)rx"),
      // Quoted value, optionally followed by blanks and a trailing # comment
      value_regex_escaped_(R"rx(^(["'])([^"'=\r\n]+)\1[\t ]*(?:#.*){0,1}$)rx") {}
std::string LoadConfigFile(const std::string& path) {
  // Open in binary mode so the bytes are read exactly as stored on disk
  std::ifstream input_file(path, std::ios_base::in | std::ios_base::binary);
  CHECK(input_file) << "Failed to open: " << path;
  // Slurp the whole stream, from the current position up to end-of-file
  using CharIterator = std::istreambuf_iterator<char>;
  return std::string(CharIterator(input_file), CharIterator());
}
/*!
* \brief parse the configure file
* \brief Normalize end-of-line in a file so that it uses LF for all
* line endings.
*
* This is needed because some OSes use CR or CR LF instead. So we
* replace all CR with LF.
*
* \param config_str contents of the configuration file
* \return copy of config_str in which every CR is replaced with LF
*/
std::vector<std::pair<std::string, std::string> > Parse() {
std::vector<std::pair<std::string, std::string> > results{};
std::string NormalizeConfigEOL(std::string const& config_str) {
  // Start from a copy of the input and rewrite it in place; the length never
  // changes because each CR maps to exactly one LF.
  std::string result(config_str);
  for (char& ch : result) {
    // Note: a CR LF pair becomes LF LF, i.e. it yields one extra empty line.
    if (ch == '\r') {
      ch = '\n';
    }
  }
  return result;
}
/*!
* \brief Parse configuration file into key-value pairs.
* \note Takes no arguments; the file read is the one whose path was given to the constructor
* \return list of key-value pairs
*/
std::vector<std::pair<std::string, std::string>> Parse() {
// Read the entire file named by path_ into memory
std::string content { LoadConfigFile(path_) };
// Turn every CR into LF so that std::getline below sees LF-terminated lines only
content = NormalizeConfigEOL(content);
std::stringstream ss { content };
std::vector<std::pair<std::string, std::string>> results;
char delimiter = '=';
char comment = '#';
std::string line{};
std::string name{};
std::string value{};
while (!fi_.eof()) {
std::getline(fi_, line); // read a line of configure file
line = line.substr(0, line.find(comment)); // anything beyond # is comment
size_t delimiterPos = line.find(delimiter); // find the = sign
name = line.substr(0, delimiterPos); // anything before = is the name
// after this = is the value
value = line.substr(delimiterPos + 1, line.length() - delimiterPos - 1);
if (line.empty() || name.empty() || value.empty())
continue; // skip a line if # at beginning or there is no value or no name.
CleanString(&name); // clean the string
CleanString(&value);
results.emplace_back(name, value);
std::string line;
std::string key, value;
// Loop over every line of the configuration file
while (std::getline(ss, line)) {
// ParseKeyValuePair returns false for blank lines and comment lines, so
// only lines that produced a key-value pair are recorded
if (ParseKeyValuePair(line, &key, &value)) {
results.emplace_back(key, value);
}
}
return results;
}
~ConfigParse() {
fi_.close();
}
private:
std::ifstream fi_;
std::string allowableChar_ =
"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-./\\";
std::string path_;
const std::regex line_comment_regex_, key_regex_, key_regex_escaped_,
value_regex_, value_regex_escaped_;
public:
/*!
* \brief Remove leading and trailing whitespaces from a given string
* \param str string
* \return Copy of str with leading and trailing whitespaces removed
*/
static std::string TrimWhitespace(const std::string& str) {
  // Characters treated as whitespace: space, tab, LF and CR
  const char* const whitespace = " \t\n\r";
  const auto first_char = str.find_first_not_of(whitespace);
  if (first_char == std::string::npos) {
    // Empty string, or every character in str is a whitespace
    return std::string();
  }
  // At least one non-whitespace character exists (at first_char), so the
  // backward search is guaranteed to succeed and needs no extra check.
  const auto last_char = str.find_last_not_of(whitespace);
  return str.substr(first_char, last_char - first_char + 1);
}
/*!
* \brief remove unnecessary chars.
* \brief Parse a key-value pair from a string representing a line
* \param str string (cannot be multi-line)
* \param key place to store the key, if parsing is successful
* \param value place to store the value, if parsing is successful
* \return Whether the parsing was successful
*/
void CleanString(std::string * str) {
size_t firstIndx = str->find_first_of(allowableChar_);
size_t lastIndx = str->find_last_of(allowableChar_);
// this line can be more efficient, but keep as is for simplicity.
*str = str->substr(firstIndx, lastIndx - firstIndx + 1);
bool ParseKeyValuePair(const std::string& str, std::string* key,
                       std::string* value) {
  // Work on a trimmed copy; an empty result means the line was blank
  std::string remaining = TrimWhitespace(str);
  if (remaining.empty()) {
    return false;
  }

  std::smatch match;

  /* Step 1: the key, which must be followed by '=' */
  if (std::regex_search(remaining, match, line_comment_regex_)) {
    // Line starts with '#': it is a comment, not a key-value pair
    return false;
  }
  if (std::regex_search(remaining, match, key_regex_)) {
    // Plain key (no whitespace, no #): whole match + one capture group
    CHECK_EQ(match.size(), 2);
    *key = match[1].str();
  } else if (std::regex_search(remaining, match, key_regex_escaped_)) {
    // Quoted key, e.g. "foo bar" or 'foo#bar': group 1 is the quote
    // character, group 2 is the text between the quotes
    CHECK_EQ(match.size(), 3);
    *key = match[2].str();
  } else {
    LOG(FATAL) << "This line is not a valid key-value pair: " << str;
  }

  /* Step 2: the value, taken from whatever follows the matched "key =" */
  remaining = TrimWhitespace(match.suffix().str());
  if (std::regex_search(remaining, match, value_regex_)) {
    // Plain value (no whitespace, no #)
    CHECK_EQ(match.size(), 2);
    *value = match[1].str();
  } else if (std::regex_search(remaining, match, value_regex_escaped_)) {
    // Quoted value, e.g. "foo bar" or 'foo#bar'
    CHECK_EQ(match.size(), 3);
    *value = match[2].str();
  } else {
    LOG(FATAL) << "This line is not a valid key-value pair: " << str;
  }
  return true;
}
};
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_CONFIG_H_