[LIBXGBOOST] pass demo running.
This commit is contained in:
parent cee148ed64
commit d75e3ed05d
2 .gitignore vendored
@@ -70,3 +70,5 @@ nb-configuration*
.settings/
build
config.mk
xgboost
*.data
23 Makefile
@@ -37,7 +37,7 @@ ifeq ($(OS), Windows_NT)
endif

export LDFLAGS= -pthread -lm $(ADD_LDFLAGS) $(DMLC_LDFLAGS)
export CFLAGS= -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops -fPIC -Iinclude $(ADD_CFLAGS)
export CFLAGS= -std=c++0x -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops -fPIC -Iinclude $(ADD_CFLAGS)
CFLAGS += -I$(DMLC_CORE)/include -I$(RABIT)/include

ifndef LINT_LANG
@@ -65,16 +65,27 @@ $(DMLC_CORE)/libdmlc.a:
$(RABIT)/lib/$(LIB_RABIT):
	+ cd $(RABIT); make lib/$(LIB_RABIT); cd $(ROOTDIR)


SRC = $(wildcard src/*.cc src/*/*.cc)
ALL_OBJ = $(patsubst src/%.cc, build/%.o, $(SRC))
AMALGA_OBJ = amalgamation/xgboost-all0.o
LIB_DEP = $(DMLC_CORE)/libdmlc.a $(RABIT)/lib/$(LIB_RABIT)
ALL_DEP = $(filter-out build/cli_main.o, $(ALL_OBJ)) $(LIB_DEP)
CLI_OBJ = build/cli_main.o

build/%.o: src/%.cc
	@mkdir -p $(@D)
	$(CXX) -std=c++0x $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
	$(CXX) -std=c++0x -c $(CFLAGS) -c $< -o $@
	$(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
	$(CXX) -c $(CFLAGS) -c $< -o $@

# This should be equivalent to $(ALL_OBJ) except for build/cli_main.o
amalgamation/xgboost-all0.o: amalgamation/xgboost-all0.cc
	$(CXX) -c $(CFLAGS) -c $< -o $@

# Equivalent to lib/libxgboost_all.so
lib/libxgboost_all.so: $(AMALGA_OBJ) $(LIB_DEP)
	@mkdir -p $(@D)
	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)

lib/libxgboost.a: $(ALL_DEP)
	@mkdir -p $(@D)
@@ -84,14 +95,14 @@ lib/libxgboost.so: $(ALL_DEP)
	@mkdir -p $(@D)
	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)

xgboost: lib/libxgboost.a $(CLI_OBJ) $(LIB_DEP)
	$(CXX) $(CFLAGS) -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
xgboost: $(CLI_OBJ) lib/libxgboost.a $(LIB_DEP)
	$(CXX) $(CFLAGS) -o $@ $(filter %.o %.a, $^) $(LDFLAGS)

lint:
	python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} include src

clean:
	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~
	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ $(AMALGA_OBJ)

clean_all: clean
	cd $(DMLC_CORE); make clean; cd -
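Note: with the targets above, "make lib/libxgboost_all.so" links the shared library from the single amalgamation object (plus libdmlc.a and the rabit library), while the "xgboost" CLI binary is linked from build/cli_main.o and lib/libxgboost.a.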
51 amalgamation/xgboost-all0.cc Normal file
@@ -0,0 +1,51 @@
/*!
 * Copyright 2015 by Contributors.
 * \brief XGBoost Amalgamation.
 *   This offers an alternative way to compile the entire library from this single file.
 *
 * Example usage command:
 * - $(CXX) -std=c++0x -fopenmp -shared -o libxgboost.so xgboost-all0.cc -ldmlc -lrabit
 *
 * \author Tianqi Chen.
 */

// metrics
#include "../src/metric/metric.cc"
#include "../src/metric/elementwise_metric.cc"
#include "../src/metric/multiclass_metric.cc"
#include "../src/metric/rank_metric.cc"

// objectives
#include "../src/objective/objective.cc"
#include "../src/objective/regression_obj.cc"
#include "../src/objective/multiclass_obj.cc"
#include "../src/objective/rank_obj.cc"

// gbms
#include "../src/gbm/gbm.cc"
#include "../src/gbm/gbtree.cc"
#include "../src/gbm/gblinear.cc"

// data
#include "../src/data/data.cc"
#include "../src/data/simple_csr_source.cc"
#include "../src/data/simple_dmatrix.cc"

// trees
#include "../src/tree/tree_model.cc"
#include "../src/tree/tree_updater.cc"
#include "../src/tree/updater_colmaker.cc"
#include "../src/tree/updater_prune.cc"
#include "../src/tree/updater_refresh.cc"
#include "../src/tree/updater_sync.cc"
#include "../src/tree/updater_histmaker.cc"
#include "../src/tree/updater_skmaker.cc"

// global
#include "../src/learner.cc"
#include "../src/logging.cc"
#include "../src/common/common.cc"

// c_api
#include "../src/c_api/c_api.cc"
#include "../src/c_api/c_api_error.cc"
@@ -1,4 +1,5 @@
#!/bin/bash
export PYTHONPATH=$PYTHONPATH:../../python-package
python basic_walkthrough.py
python custom_objective.py
python boost_from_prediction.py
@@ -9,4 +10,4 @@ python predict_leaf_indices.py
python sklearn_examples.py
python sklearn_parallel.py
python external_memory.py
rm -rf *~ *.model *.buffer
rm -rf *~ *.model *.buffer
@@ -1 +1 @@
Subproject commit ec454218564fee8e531aee02b8943a4634330ce1
Subproject commit c0325077a3ceda08fe04b2aa115e004a3520630a
@@ -16,6 +16,15 @@
#define XGBOOST_STRICT_R_MODE 0
#endif

/*!
 * \brief Whether to always log console messages with time.
 *   When enabled, a timestamp is prepended to the head of the message, e.g.
 *   "[21:47:50] 6513x126 matrix with 143286 entries loaded from ../data/agaricus.txt.train"
 */
#ifndef XGBOOST_LOG_WITH_TIME
#define XGBOOST_LOG_WITH_TIME 0
#endif

/*! \brief namespace of xgboost */
namespace xgboost {
/*!
@@ -23,6 +32,8 @@ namespace xgboost {
 * used for feature index and row index.
 */
typedef uint32_t bst_uint;
/*! \brief long integers */
typedef unsigned long bst_ulong;  // NOLINT(*)
/*! \brief float type, used for storing statistics */
typedef float bst_float;
@@ -36,13 +36,6 @@ typedef void *BoosterHandle;
 */
XGB_DLL const char *XGBGetLastError();

/*!
 * \brief Entry point of CLI program.
 * \param argc The number of arguments.
 * \param argv The command line arguments.
 */
XGB_DLL int XGBoostCLIMain(int argc, char* argv[]);

/*!
 * \brief load a data matrix
 * \param fname the name of the file
@@ -59,7 +59,7 @@ struct MetaInfo {
  /*! \brief version flag, used to check version of this info */
  static const int kVersion = 1;
  /*! \brief default constructor */
  MetaInfo() : num_row(0), num_col(0) {}
  MetaInfo() : num_row(0), num_col(0), num_nonzero(0) {}
  /*!
   * \brief Get weight of each instance.
   * \param i Instance index.
@@ -96,14 +96,6 @@ struct MetaInfo {
   * \param num Number of elements in the source array.
   */
  void SetInfo(const char* key, const void* dptr, DataType dtype, size_t num);
  /*!
   * \brief Get information from meta info.
   * \param key The key of the information.
   * \param dptr The output data pointer of the source array.
   * \param dtype The output data type of the information array.
   * \param num Number of elements in the array.
   */
  void GetInfo(const char* key, const void** dptr, DataType* dtype, size_t* num) const;
};

/*! \brief read-only sparse instance batch in CSR format */
@@ -259,11 +251,14 @@ class DMatrix {
   * \param uri The URI of input.
   * \param silent Whether to print information during loading.
   * \param load_row_split Flag to read in part of the rows, divided among the workers in distributed mode.
   * \param file_format The format type of the file, used for dmlc::Parser::Create.
   *   By default "auto" is able to load local binary files as well.
   * \return The created DMatrix.
   */
  static DMatrix* Load(const std::string& uri,
                       bool silent,
                       bool load_row_split);
                       bool load_row_split,
                       const std::string& file_format = "auto");
  /*!
   * \brief create a new DMatrix, by wrapping a row_iterator, and meta info.
   * \param source The source iterator of the data; the create function takes ownership of the source.
@@ -273,7 +268,7 @@ class DMatrix {
   * \return a created DMatrix.
   */
  static DMatrix* Create(std::unique_ptr<DataSource>&& source,
                         const char* cache_prefix = nullptr);
                         const std::string& cache_prefix = "");
  /*!
   * \brief Create a DMatrix by loading data from a parser.
   *   The parser can be deleted after the DMatrix is created.
@@ -287,7 +282,7 @@ class DMatrix {
   * \return A created DMatrix.
   */
  static DMatrix* Create(dmlc::Parser<uint32_t>* parser,
                         const char* cache_prefix = nullptr);
                         const std::string& cache_prefix = "");

 private:
  // allow learner class to access this field.
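A minimal call sketch for the extended DMatrix::Load signature above (the path mirrors the demo data referenced elsewhere in this commit; error handling omitted):

    // "auto" lets dmlc::Parser::Create pick the file format.
    xgboost::DMatrix* dtrain = xgboost::DMatrix::Load(
        "../data/agaricus.txt.train", /*silent=*/false,
        /*load_row_split=*/false, /*file_format=*/"auto");
    // ... use dtrain ...
    delete dtrain;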
@@ -163,7 +163,7 @@ struct GradientBoosterReg
 */
#define XGBOOST_REGISTER_GBM(UniqueId, Name) \
static ::xgboost::GradientBoosterReg & __make_ ## GradientBoosterReg ## _ ## UniqueId ## __ = \
::dmlc::Registry< ::xgboost::GradientBoosterReg>::Get()->__REGISTER__(#Name)
::dmlc::Registry< ::xgboost::GradientBoosterReg>::Get()->__REGISTER__(Name)

}  // namespace xgboost
#endif  // XGBOOST_GBM_H_
@@ -36,6 +36,8 @@ namespace xgboost {
 */
class Learner : public rabit::Serializable {
 public:
  /*! \brief virtual destructor */
  virtual ~Learner() {}
  /*!
   * \brief set configuration from pair iterators.
   * \param begin The beginning iterator.
@@ -51,6 +53,11 @@ class Learner : public rabit::Serializable {
   * \param cfg configurations on both training and model parameters.
   */
  virtual void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) = 0;
  /*!
   * \brief Initialize the model using the configurations specified via Configure.
   *   A model has to be either loaded or initialized before Update/Predict/Save can be called.
   */
  virtual void InitModel() = 0;
  /*!
   * \brief load model from stream
   * \param fi input stream.
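A sketch of the call-order contract documented above: Configure before InitModel, and one of InitModel/Load before Update/Predict/Save. Learner::Create taking a cache vector follows its use in src/c_api/c_api.cc below; the parameter value is illustrative:

    std::vector<std::pair<std::string, std::string> > cfg;
    cfg.push_back(std::make_pair("objective", "binary:logistic"));
    std::unique_ptr<xgboost::Learner> learner(
        xgboost::Learner::Create(std::vector<xgboost::DMatrix*>()));
    learner->Configure(cfg);
    learner->InitModel();  // required before Update/Predict/Save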
50 include/xgboost/logging.h Normal file
@@ -0,0 +1,50 @@
/*!
 * Copyright (c) 2015 by Contributors
 * \file logging.h
 * \brief defines console logging options for xgboost.
 *   Used to enforce unified print behavior.
 *   For debug loggers, use LOG(INFO) and LOG(ERROR).
 */
#ifndef XGBOOST_LOGGING_H_
#define XGBOOST_LOGGING_H_

#include <dmlc/logging.h>
#include <sstream>
#include "./base.h"

namespace xgboost {

class BaseLogger {
 public:
  BaseLogger() {
#if XGBOOST_LOG_WITH_TIME
    log_stream_ << "[" << dmlc::DateLogger().HumanDate() << "] ";
#endif
  }
  std::ostream& stream() { return log_stream_; }

 protected:
  std::ostringstream log_stream_;
};

class ConsoleLogger : public BaseLogger {
 public:
  ~ConsoleLogger();
};

class TrackerLogger : public BaseLogger {
 public:
  ~TrackerLogger();
};

// redefine the logging macro if it does not exist
#ifndef LOG
#define LOG(severity) LOG_##severity.stream()
#endif

// Enables LOG(CONSOLE) for printing messages to the console.
#define LOG_CONSOLE ::xgboost::ConsoleLogger()
// Enables LOG(TRACKER) for printing messages to the tracker.
#define LOG_TRACKER ::xgboost::TrackerLogger()
}  // namespace xgboost
#endif  // XGBOOST_LOGGING_H_
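A minimal usage sketch for the header above (illustrative; the flush-on-destruction behavior lives in the ConsoleLogger destructor, defined in src/logging.cc, which the amalgamation includes but this diff does not show):

    #include <xgboost/logging.h>

    void report_progress() {
      // Given the LOG(severity) macro above, LOG(CONSOLE) expands to
      // ::xgboost::ConsoleLogger().stream(); with XGBOOST_LOG_WITH_TIME=1
      // a "[HH:MM:SS] " prefix is prepended.
      LOG(CONSOLE) << "6513x126 matrix loaded";
    }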
@@ -70,7 +70,7 @@ struct MetricReg
 * \endcode
 */
#define XGBOOST_REGISTER_METRIC(UniqueId, Name) \
static ::xgboost::MetricReg & __make_ ## MetricReg ## _ ## UniqueId ## __ = \
::dmlc::Registry< ::xgboost::MetricReg>::Get()->__REGISTER__(#Name)
::xgboost::MetricReg& __make_ ## MetricReg ## _ ## UniqueId ## __ = \
::dmlc::Registry< ::xgboost::MetricReg>::Get()->__REGISTER__(Name)
}  // namespace xgboost
#endif  // XGBOOST_METRIC_H_
@@ -106,6 +106,6 @@ struct ObjFunctionReg
 */
#define XGBOOST_REGISTER_OBJECTIVE(UniqueId, Name) \
static ::xgboost::ObjFunctionReg & __make_ ## ObjFunctionReg ## _ ## UniqueId ## __ = \
::dmlc::Registry< ::xgboost::ObjFunctionReg>::Get()->__REGISTER__(#Name)
::dmlc::Registry< ::xgboost::ObjFunctionReg>::Get()->__REGISTER__(Name)
}  // namespace xgboost
#endif  // XGBOOST_OBJECTIVE_H_
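Note: the change from __REGISTER__(#Name) to __REGISTER__(Name) in these registration macros (gbm.h and metric.h above, objective.h here, tree_updater.h below) means Name is now passed as a string expression rather than a stringified token. A hypothetical registration sketch; MyLogisticObj is an illustration, not part of this commit, and the .describe/.set_body chain assumes the usual dmlc::Registry entry interface, which this diff does not show:

    XGBOOST_REGISTER_OBJECTIVE(MyLogistic, "binary:mylogistic")
    .describe("Hypothetical logistic objective, for illustration only.")
    .set_body([]() { return new MyLogisticObj(); });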
@@ -8,7 +8,6 @@
#define XGBOOST_TREE_MODEL_H_

#include <dmlc/io.h>
#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <limits>
#include <vector>
@@ -17,6 +16,7 @@
#include <algorithm>
#include "./base.h"
#include "./data.h"
#include "./logging.h"
#include "./feature_map.h"

namespace xgboost {
@@ -79,7 +79,7 @@ struct TreeUpdaterReg
 */
#define XGBOOST_REGISTER_TREE_UPDATER(UniqueId, Name) \
static ::xgboost::TreeUpdaterReg& __make_ ## TreeUpdaterReg ## _ ## UniqueId ## __ = \
::dmlc::Registry< ::xgboost::TreeUpdaterReg>::Get()->__REGISTER__(#Name)
::dmlc::Registry< ::xgboost::TreeUpdaterReg>::Get()->__REGISTER__(Name)

}  // namespace xgboost
#endif  // XGBOOST_TREE_UPDATER_H_
@@ -1,229 +0,0 @@
// Copyright by Contributors
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#define NOMINMAX
#include <string>
#include "../utils/io.h"

// implements a single no-split version of DMLC
// in case we want to avoid the dependency on dmlc-core

namespace xgboost {
namespace utils {
/*!
 * \brief line split implementation from a single FILE;
 *   simply returns lines of files, used for stdin
 */
class SingleFileSplit : public dmlc::InputSplit {
 public:
  explicit SingleFileSplit(const char *fname)
      : use_stdin_(false),
        chunk_begin_(NULL), chunk_end_(NULL) {
    if (!std::strcmp(fname, "stdin")) {
#ifndef XGBOOST_STRICT_CXX98_
      use_stdin_ = true; fp_ = stdin;
#endif
    }
    if (!use_stdin_) {
      fp_ = utils::FopenCheck(fname, "rb");
    }
    buffer_.resize(kBufferSize);
  }
  virtual ~SingleFileSplit(void) {
    if (!use_stdin_) std::fclose(fp_);
  }
  virtual size_t Read(void *ptr, size_t size) {
    return std::fread(ptr, 1, size, fp_);
  }
  virtual void Write(const void *ptr, size_t size) {
    utils::Error("cannot do write in inputsplit");
  }
  virtual void BeforeFirst(void) {
    std::fseek(fp_, 0, SEEK_SET);
  }
  virtual bool NextRecord(Blob *out_rec) {
    if (chunk_begin_ == chunk_end_) {
      if (!LoadChunk()) return false;
    }
    char *next = FindNextRecord(chunk_begin_,
                                chunk_end_);
    out_rec->dptr = chunk_begin_;
    out_rec->size = next - chunk_begin_;
    chunk_begin_ = next;
    return true;
  }
  virtual bool NextChunk(Blob *out_chunk) {
    if (chunk_begin_ == chunk_end_) {
      if (!LoadChunk()) return false;
    }
    out_chunk->dptr = chunk_begin_;
    out_chunk->size = chunk_end_ - chunk_begin_;
    chunk_begin_ = chunk_end_;
    return true;
  }
  inline bool ReadChunk(void *buf, size_t *size) {
    size_t max_size = *size;
    if (max_size <= overflow_.length()) {
      *size = 0; return true;
    }
    if (overflow_.length() != 0) {
      std::memcpy(buf, BeginPtr(overflow_), overflow_.length());
    }
    size_t olen = overflow_.length();
    overflow_.resize(0);
    size_t nread = this->Read(reinterpret_cast<char*>(buf) + olen,
                              max_size - olen);
    nread += olen;
    if (nread == 0) return false;
    if (nread != max_size) {
      *size = nread;
      return true;
    } else {
      const char *bptr = reinterpret_cast<const char*>(buf);
      // return the last position where a record starts
      const char *bend = this->FindLastRecordBegin(bptr, bptr + max_size);
      *size = bend - bptr;
      overflow_.resize(max_size - *size);
      if (overflow_.length() != 0) {
        std::memcpy(BeginPtr(overflow_), bend, overflow_.length());
      }
      return true;
    }
  }

 protected:
  inline const char* FindLastRecordBegin(const char *begin,
                                         const char *end) {
    if (begin == end) return begin;
    for (const char *p = end - 1; p != begin; --p) {
      if (*p == '\n' || *p == '\r') return p + 1;
    }
    return begin;
  }
  inline char* FindNextRecord(char *begin, char *end) {
    char *p;
    for (p = begin; p != end; ++p) {
      if (*p == '\n' || *p == '\r') break;
    }
    for (; p != end; ++p) {
      if (*p != '\n' && *p != '\r') return p;
    }
    return end;
  }
  inline bool LoadChunk(void) {
    while (true) {
      size_t size = buffer_.length();
      if (!ReadChunk(BeginPtr(buffer_), &size)) return false;
      if (size == 0) {
        buffer_.resize(buffer_.length() * 2);
      } else {
        chunk_begin_ = reinterpret_cast<char *>(BeginPtr(buffer_));
        chunk_end_ = chunk_begin_ + size;
        break;
      }
    }
    return true;
  }

 private:
  // buffer size
  static const size_t kBufferSize = 1 << 18UL;
  // file
  std::FILE *fp_;
  bool use_stdin_;
  // internal overflow
  std::string overflow_;
  // internal buffer
  std::string buffer_;
  // beginning of chunk
  char *chunk_begin_;
  // end of chunk
  char *chunk_end_;
};

class StdFile : public dmlc::Stream {
 public:
  explicit StdFile(std::FILE *fp, bool use_stdio)
      : fp(fp), use_stdio(use_stdio) {
  }
  virtual ~StdFile(void) {
    this->Close();
  }
  virtual size_t Read(void *ptr, size_t size) {
    return std::fread(ptr, 1, size, fp);
  }
  virtual void Write(const void *ptr, size_t size) {
    Check(std::fwrite(ptr, size, 1, fp) == 1, "StdFile::Write: fwrite error!");
  }
  virtual void Seek(size_t pos) {
    std::fseek(fp, static_cast<long>(pos), SEEK_SET);  // NOLINT(*)
  }
  virtual size_t Tell(void) {
    return std::ftell(fp);
  }
  virtual bool AtEnd(void) const {
    return std::feof(fp) != 0;
  }
  inline void Close(void) {
    if (fp != NULL && !use_stdio) {
      std::fclose(fp); fp = NULL;
    }
  }

 private:
  std::FILE *fp;
  bool use_stdio;
};
}  // namespace utils
}  // namespace xgboost

namespace dmlc {
InputSplit* InputSplit::Create(const char *uri,
                               unsigned part,
                               unsigned nsplit,
                               const char *type) {
  using namespace std;
  using namespace xgboost;
  const char *msg = "xgboost is compiled in local mode\n"\
      "to use hdfs, s3 or distributed version, compile with make dmlc=1";
  utils::Check(strncmp(uri, "s3://", 5) != 0, msg);
  utils::Check(strncmp(uri, "hdfs://", 7) != 0, msg);
  utils::Check(nsplit == 1, msg);
  return new utils::SingleFileSplit(uri);
}

Stream *Stream::Create(const char *fname, const char * const mode, bool allow_null) {
  using namespace std;
  using namespace xgboost;
  const char *msg = "xgboost is compiled in local mode\n"\
      "to use hdfs, s3 or distributed version, compile with make dmlc=1";
  utils::Check(strncmp(fname, "s3://", 5) != 0, msg);
  utils::Check(strncmp(fname, "hdfs://", 7) != 0, msg);

  std::FILE *fp = NULL;
  bool use_stdio = false;
  using namespace std;
#ifndef XGBOOST_STRICT_CXX98_
  if (!strcmp(fname, "stdin")) {
    use_stdio = true; fp = stdin;
  }
  if (!strcmp(fname, "stdout")) {
    use_stdio = true; fp = stdout;
  }
#endif
  if (!strncmp(fname, "file://", 7)) fname += 7;
  if (!use_stdio) {
    std::string flag = mode;
    if (flag == "w") flag = "wb";
    if (flag == "r") flag = "rb";
    fp = fopen64(fname, flag.c_str());
  }
  if (fp != NULL) {
    return new utils::StdFile(fp, use_stdio);
  } else {
    utils::Check(allow_null, "fail to open file %s", fname);
    return NULL;
  }
}
}  // namespace dmlc
@@ -1,212 +0,0 @@
/*!
 * Copyright (c) 2015 by Contributors
 * \file libsvm_parser.h
 * \brief iterator parser to parse libsvm format
 * \author Tianqi Chen
 */
#ifndef XGBOOST_IO_LIBSVM_PARSER_H_
#define XGBOOST_IO_LIBSVM_PARSER_H_
#define NOMINMAX
#include <vector>
#include <cstring>
#include <cctype>
#include <algorithm>
#include "../utils/omp.h"
#include "../utils/utils.h"
#include "../sync/sync.h"
#include "../utils/thread_buffer.h"
#include "./sparse_batch_page.h"

namespace xgboost {
namespace io {
/*! \brief page returned by libsvm parser */
struct LibSVMPage : public SparsePage {
  std::vector<float> label;
  // overload clear
  inline void Clear() {
    SparsePage::Clear();
    label.clear();
  }
};
/*!
 * \brief libsvm parser that parses the input lines
 *   and returns rows in the input data;
 *   a factory used by the thread buffer template
 */
class LibSVMPageFactory {
 public:
  LibSVMPageFactory()
      : bytes_read_(0), at_head_(true) {
  }
  inline bool Init(void) {
    return true;
  }
  inline void Setup(dmlc::InputSplit *source,
                    int nthread) {
    source_ = source;
    int maxthread;
#pragma omp parallel
    {
      maxthread = omp_get_num_procs();
    }
    maxthread = std::max(maxthread / 2, 1);
    nthread_ = std::min(maxthread, nthread);
  }
  inline void SetParam(const char *name, const char *val) {}
  inline bool LoadNext(std::vector<LibSVMPage> *data) {
    return FillData(data);
  }
  inline void FreeSpace(std::vector<LibSVMPage> *a) {
    delete a;
  }
  inline std::vector<LibSVMPage> *Create(void) {
    return new std::vector<LibSVMPage>();
  }
  inline void BeforeFirst(void) {
    utils::Assert(at_head_, "cannot call beforefirst");
  }
  inline void Destroy(void) {
    delete source_;
  }
  inline size_t bytes_read(void) const {
    return bytes_read_;
  }

 protected:
  inline bool FillData(std::vector<LibSVMPage> *data) {
    dmlc::InputSplit::Blob chunk;
    if (!source_->NextChunk(&chunk)) return false;
    int nthread;
#pragma omp parallel num_threads(nthread_)
    {
      nthread = omp_get_num_threads();
    }
    // reserve space for data
    data->resize(nthread);
    bytes_read_ += chunk.size;
    utils::Assert(chunk.size != 0, "LibSVMParser.FileData");
    char *head = reinterpret_cast<char*>(chunk.dptr);
#pragma omp parallel num_threads(nthread_)
    {
      // thread id
      int tid = omp_get_thread_num();
      size_t nstep = (chunk.size + nthread - 1) / nthread;
      size_t sbegin = std::min(tid * nstep, chunk.size);
      size_t send = std::min((tid + 1) * nstep, chunk.size);
      char *pbegin = BackFindEndLine(head + sbegin, head);
      char *pend;
      if (tid + 1 == nthread) {
        pend = head + send;
      } else {
        pend = BackFindEndLine(head + send, head);
      }
      ParseBlock(pbegin, pend, &(*data)[tid]);
    }
    return true;
  }
  /*!
   * \brief parse data into out
   * \param begin beginning of buffer
   * \param end end of buffer
   */
  inline void ParseBlock(char *begin,
                         char *end,
                         LibSVMPage *out) {
    using namespace std;
    out->Clear();
    char *p = begin;
    while (p != end) {
      while (isspace(*p) && p != end) ++p;
      if (p == end) break;
      char *head = p;
      while (isdigit(*p) && p != end) ++p;
      if (*p == ':') {
        out->data.push_back(SparseBatch::Entry(atol(head),
                                               static_cast<bst_float>(atof(p + 1))));
      } else {
        if (out->label.size() != 0) {
          out->offset.push_back(out->data.size());
        }
        out->label.push_back(static_cast<float>(atof(head)));
      }
      while (!isspace(*p) && p != end) ++p;
    }
    if (out->label.size() != 0) {
      out->offset.push_back(out->data.size());
    }
    utils::Check(out->label.size() + 1 == out->offset.size(),
                 "LibSVMParser inconsistent");
  }
  /*!
   * \brief starting from bptr, go backward and find the first end-of-line
   * \param bptr end position to go backward from
   * \param begin the beginning position of the buffer
   * \return position of the first end-of-line going backward
   */
  inline char* BackFindEndLine(char *bptr,
                               char *begin) {
    for (; bptr != begin; --bptr) {
      if (*bptr == '\n' || *bptr == '\r') return bptr;
    }
    return begin;
  }

 private:
  // nthread
  int nthread_;
  // number of bytes read
  size_t bytes_read_;
  // at beginning, at end of stream
  bool at_head_;
  // source split that provides the data
  dmlc::InputSplit *source_;
};

class LibSVMParser : public utils::IIterator<LibSVMPage> {
 public:
  explicit LibSVMParser(dmlc::InputSplit *source,
                        int nthread)
      : at_end_(false), data_ptr_(0), data_(NULL) {
    itr.SetParam("buffer_size", "2");
    itr.get_factory().Setup(source, nthread);
    itr.Init();
  }
  virtual void BeforeFirst(void) {
    itr.BeforeFirst();
  }
  virtual bool Next(void) {
    if (at_end_) return false;
    while (true) {
      if (data_ == NULL || data_ptr_ >= data_->size()) {
        if (!itr.Next(data_)) {
          at_end_ = true; return false;
        } else {
          data_ptr_ = 0;
        }
      }
      while (data_ptr_ < data_->size()) {
        data_ptr_ += 1;
        if ((*data_)[data_ptr_ - 1].Size() != 0) {
          return true;
        }
      }
    }
    return true;
  }
  virtual const LibSVMPage &Value(void) const {
    return (*data_)[data_ptr_ - 1];
  }
  inline size_t bytes_read(void) const {
    return itr.get_factory().bytes_read();
  }

 private:
  bool at_end_;
  size_t data_ptr_;
  std::vector<LibSVMPage> *data_;
  utils::ThreadBuffer<std::vector<LibSVMPage>*, LibSVMPageFactory> itr;
};

}  // namespace io
}  // namespace xgboost
#endif  // XGBOOST_IO_LIBSVM_PARSER_H_
@@ -1,374 +0,0 @@
/*!
 * Copyright 2014 by Contributors
 * \file simple_fmatrix-inl.hpp
 * \brief the input data structure for gradient boosting
 * \author Tianqi Chen
 */
#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_

#include <limits>
#include <algorithm>
#include <vector>
#include "../data.h"
#include "../utils/utils.h"
#include "../utils/random.h"
#include "../utils/omp.h"
#include "../learner/dmatrix.h"
#include "../utils/group_data.h"
#include "./sparse_batch_page.h"

namespace xgboost {
namespace io {
/*!
 * \brief sparse matrix that supports column access (CSC)
 */
class FMatrixS : public IFMatrix {
 public:
  typedef SparseBatch::Entry Entry;
  /*! \brief constructor */
  FMatrixS(utils::IIterator<RowBatch> *iter,
           const learner::MetaInfo &info)
      : info_(info) {
    this->iter_ = iter;
  }
  // destructor
  virtual ~FMatrixS(void) {
    if (iter_ != NULL) delete iter_;
  }
  /*! \return whether column access is enabled */
  virtual bool HaveColAccess(void) const {
    return col_size_.size() != 0;
  }
  /*! \brief get number of columns */
  virtual size_t NumCol(void) const {
    utils::Check(this->HaveColAccess(), "NumCol:need column access");
    return col_size_.size();
  }
  /*! \brief get number of buffered rows */
  virtual const std::vector<bst_uint> &buffered_rowset(void) const {
    return buffered_rowset_;
  }
  /*! \brief get column size */
  virtual size_t GetColSize(size_t cidx) const {
    return col_size_[cidx];
  }
  /*! \brief get column density */
  virtual float GetColDensity(size_t cidx) const {
    size_t nmiss = buffered_rowset_.size() - col_size_[cidx];
    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
  }
  virtual void InitColAccess(const std::vector<bool> &enabled,
                             float pkeep, size_t max_row_perbatch) {
    if (this->HaveColAccess()) return;
    this->InitColData(enabled, pkeep, max_row_perbatch);
  }
  /*!
   * \brief get the row iterator associated with FMatrix
   */
  virtual utils::IIterator<RowBatch>* RowIterator(void) {
    iter_->BeforeFirst();
    return iter_;
  }
  /*!
   * \brief get the column based iterator
   */
  virtual utils::IIterator<ColBatch>* ColIterator(void) {
    size_t ncol = this->NumCol();
    col_iter_.col_index_.resize(ncol);
    for (size_t i = 0; i < ncol; ++i) {
      col_iter_.col_index_[i] = static_cast<bst_uint>(i);
    }
    col_iter_.BeforeFirst();
    return &col_iter_;
  }
  /*!
   * \brief column based iterator
   */
  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
    size_t ncol = this->NumCol();
    col_iter_.col_index_.resize(0);
    for (size_t i = 0; i < fset.size(); ++i) {
      if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]);
    }
    col_iter_.BeforeFirst();
    return &col_iter_;
  }
  /*!
   * \brief save column access data into stream
   * \param fo output stream to save to
   */
  inline void SaveColAccess(utils::IStream &fo) const {  // NOLINT(*)
    size_t n = 0;
    fo.Write(&n, sizeof(n));
  }
  /*!
   * \brief load column access data from stream
   * \param fi input stream to load from
   */
  inline void LoadColAccess(utils::IStream &fi) {  // NOLINT(*)
    // do nothing in load col access
  }

 protected:
  /*!
   * \brief initialize column data
   * \param enabled the list of enabled columns
   * \param pkeep probability to keep a row
   * \param max_row_perbatch maximum rows per batch
   */
  inline void InitColData(const std::vector<bool> &enabled,
                          float pkeep, size_t max_row_perbatch) {
    col_iter_.Clear();
    if (info_.num_row() < max_row_perbatch) {
      SparsePage *page = new SparsePage();
      this->MakeOneBatch(enabled, pkeep, page);
      col_iter_.cpages_.push_back(page);
    } else {
      this->MakeManyBatch(enabled, pkeep, max_row_perbatch);
    }
    // setup col-size
    col_size_.resize(info_.num_col());
    std::fill(col_size_.begin(), col_size_.end(), 0);
    for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) {
      SparsePage *pcol = col_iter_.cpages_[i];
      for (size_t j = 0; j < pcol->Size(); ++j) {
        col_size_[j] += pcol->offset[j + 1] - pcol->offset[j];
      }
    }
  }
  /*!
   * \brief make column page from iterator
   * \param pkeep probability to keep a row
   * \param pcol the target column page
   */
  inline void MakeOneBatch(const std::vector<bool> &enabled,
                           float pkeep,
                           SparsePage *pcol) {
    // clear rowset
    buffered_rowset_.clear();
    // bit map
    int nthread;
    std::vector<bool> bmap;
#pragma omp parallel
    {
      nthread = omp_get_num_threads();
    }
    pcol->Clear();
    utils::ParallelGroupBuilder<SparseBatch::Entry>
        builder(&pcol->offset, &pcol->data);
    builder.InitBudget(info_.num_col(), nthread);
    // start working
    iter_->BeforeFirst();
    while (iter_->Next()) {
      const RowBatch &batch = iter_->Value();
      bmap.resize(bmap.size() + batch.size, true);
      long batch_size = static_cast<long>(batch.size);  // NOLINT(*)
      for (long i = 0; i < batch_size; ++i) {  // NOLINT(*)
        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
          buffered_rowset_.push_back(ridx);
        } else {
          bmap[i] = false;
        }
      }
#pragma omp parallel for schedule(static)
      for (long i = 0; i < batch_size; ++i) {  // NOLINT(*)
        int tid = omp_get_thread_num();
        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
        if (bmap[ridx]) {
          RowBatch::Inst inst = batch[i];
          for (bst_uint j = 0; j < inst.length; ++j) {
            if (enabled[inst[j].index]) {
              builder.AddBudget(inst[j].index, tid);
            }
          }
        }
      }
    }
    builder.InitStorage();

    iter_->BeforeFirst();
    while (iter_->Next()) {
      const RowBatch &batch = iter_->Value();
#pragma omp parallel for schedule(static)
      for (long i = 0; i < static_cast<long>(batch.size); ++i) {  // NOLINT(*)
        int tid = omp_get_thread_num();
        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
        if (bmap[ridx]) {
          RowBatch::Inst inst = batch[i];
          for (bst_uint j = 0; j < inst.length; ++j) {
            if (enabled[inst[j].index]) {
              builder.Push(inst[j].index,
                           Entry((bst_uint)(batch.base_rowid+i),
                                 inst[j].fvalue), tid);
            }
          }
        }
      }
    }

    utils::Assert(pcol->Size() == info_.num_col(),
                  "inconsistent col data");
    // sort columns
    bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
    for (bst_omp_uint i = 0; i < ncol; ++i) {
      if (pcol->offset[i] < pcol->offset[i + 1]) {
        std::sort(BeginPtr(pcol->data) + pcol->offset[i],
                  BeginPtr(pcol->data) + pcol->offset[i + 1],
                  SparseBatch::Entry::CmpValue);
      }
    }
  }

  inline void MakeManyBatch(const std::vector<bool> &enabled,
                            float pkeep, size_t max_row_perbatch) {
    size_t btop = 0;
    buffered_rowset_.clear();
    // internal temp cache
    SparsePage tmp; tmp.Clear();
    iter_->BeforeFirst();
    while (iter_->Next()) {
      const RowBatch &batch = iter_->Value();
      for (size_t i = 0; i < batch.size; ++i) {
        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
          buffered_rowset_.push_back(ridx);
          tmp.Push(batch[i]);
        }
        if (tmp.Size() >= max_row_perbatch) {
          SparsePage *page = new SparsePage();
          this->MakeColPage(tmp.GetRowBatch(0),
                            BeginPtr(buffered_rowset_) + btop,
                            enabled, page);
          col_iter_.cpages_.push_back(page);
          btop = buffered_rowset_.size();
          tmp.Clear();
        }
      }
    }
    if (tmp.Size() != 0) {
      SparsePage *page = new SparsePage();
      this->MakeColPage(tmp.GetRowBatch(0),
                        BeginPtr(buffered_rowset_) + btop,
                        enabled, page);
      col_iter_.cpages_.push_back(page);
    }
  }
  // make column page from a subset of row batches
  inline void MakeColPage(const RowBatch &batch,
                          const bst_uint *ridx,
                          const std::vector<bool> &enabled,
                          SparsePage *pcol) {
    int nthread;
#pragma omp parallel
    {
      nthread = omp_get_num_threads();
      int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1);
      if (nthread > max_nthread) {
        nthread = max_nthread;
      }
    }
    pcol->Clear();
    utils::ParallelGroupBuilder<SparseBatch::Entry>
        builder(&pcol->offset, &pcol->data);
    builder.InitBudget(info_.num_col(), nthread);
    bst_omp_uint ndata = static_cast<bst_uint>(batch.size);
#pragma omp parallel for schedule(static) num_threads(nthread)
    for (bst_omp_uint i = 0; i < ndata; ++i) {
      int tid = omp_get_thread_num();
      RowBatch::Inst inst = batch[i];
      for (bst_uint j = 0; j < inst.length; ++j) {
        const SparseBatch::Entry &e = inst[j];
        if (enabled[e.index]) {
          builder.AddBudget(e.index, tid);
        }
      }
    }
    builder.InitStorage();
#pragma omp parallel for schedule(static) num_threads(nthread)
    for (bst_omp_uint i = 0; i < ndata; ++i) {
      int tid = omp_get_thread_num();
      RowBatch::Inst inst = batch[i];
      for (bst_uint j = 0; j < inst.length; ++j) {
        const SparseBatch::Entry &e = inst[j];
        builder.Push(e.index,
                     SparseBatch::Entry(ridx[i], e.fvalue),
                     tid);
      }
    }
    utils::Assert(pcol->Size() == info_.num_col(), "inconsistent col data");
    // sort columns
    bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
    for (bst_omp_uint i = 0; i < ncol; ++i) {
      if (pcol->offset[i] < pcol->offset[i + 1]) {
        std::sort(BeginPtr(pcol->data) + pcol->offset[i],
                  BeginPtr(pcol->data) + pcol->offset[i + 1],
                  SparseBatch::Entry::CmpValue);
      }
    }
  }

 private:
  // one-batch iterator that returns content in the matrix
  struct ColBatchIter : utils::IIterator<ColBatch> {
    ColBatchIter(void) : data_ptr_(0) {}
    virtual ~ColBatchIter(void) {
      this->Clear();
    }
    virtual void BeforeFirst(void) {
      data_ptr_ = 0;
    }
    virtual bool Next(void) {
      if (data_ptr_ >= cpages_.size()) return false;
      data_ptr_ += 1;
      SparsePage *pcol = cpages_[data_ptr_ - 1];
      batch_.size = col_index_.size();
      col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0));
      for (size_t i = 0; i < col_data_.size(); ++i) {
        const bst_uint ridx = col_index_[i];
        col_data_[i] = SparseBatch::Inst
            (BeginPtr(pcol->data) + pcol->offset[ridx],
             static_cast<bst_uint>(pcol->offset[ridx + 1] - pcol->offset[ridx]));
      }
      batch_.col_index = BeginPtr(col_index_);
      batch_.col_data = BeginPtr(col_data_);
      return true;
    }
    virtual const ColBatch &Value(void) const {
      return batch_;
    }
    inline void Clear(void) {
      for (size_t i = 0; i < cpages_.size(); ++i) {
        delete cpages_[i];
      }
      cpages_.clear();
    }
    // data content
    std::vector<bst_uint> col_index_;
    // column content
    std::vector<ColBatch::Inst> col_data_;
    // column sparse pages
    std::vector<SparsePage*> cpages_;
    // data pointer
    size_t data_ptr_;
    // temporary space for batch
    ColBatch batch_;
  };
  // --- data structures used to support InitColAccess ---
  // column iterator
  ColBatchIter col_iter_;
  // shared meta info with DMatrix
  const learner::MetaInfo &info_;
  // row iterator
  utils::IIterator<RowBatch> *iter_;
  /*! \brief list of row indices that are buffered */
  std::vector<bst_uint> buffered_rowset_;
  // count for column data
  std::vector<size_t> col_size_;
};
}  // namespace io
}  // namespace xgboost
#endif  // XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
@@ -1,176 +0,0 @@
/*!
 * Copyright 2014 by Contributors
 * \file dmatrix.h
 * \brief meta data and template data structure
 *   used for regression/classification/ranking
 * \author Tianqi Chen
 */
#ifndef XGBOOST_LEARNER_DMATRIX_H_
#define XGBOOST_LEARNER_DMATRIX_H_

#include <vector>
#include <cstring>
#include "../data.h"
#include "../utils/io.h"
namespace xgboost {
namespace learner {
/*!
 * \brief meta information needed in training, including label and weight
 */
struct MetaInfo {
  /*!
   * \brief information needed by the booster;
   *   BoosterInfo does not implement save and load,
   *   all serialization is done in MetaInfo
   */
  BoosterInfo info;
  /*! \brief label of each instance */
  std::vector<float> labels;
  /*!
   * \brief the index of the beginning and end of a group,
   *   needed when the learning task is ranking
   */
  std::vector<bst_uint> group_ptr;
  /*! \brief weights of each instance, optional */
  std::vector<float> weights;
  /*!
   * \brief initialized margins;
   *   if specified, xgboost will start from this initial margin;
   *   can be used to specify the initial prediction to boost from
   */
  std::vector<float> base_margin;
  /*! \brief version flag, used to check version of this info */
  static const int kVersion = 0;
  // constructor
  MetaInfo(void) {}
  /*! \return number of rows in dataset */
  inline size_t num_row(void) const {
    return info.num_row;
  }
  /*! \return number of columns in dataset */
  inline size_t num_col(void) const {
    return info.num_col;
  }
  /*! \brief clear all the information */
  inline void Clear(void) {
    labels.clear();
    group_ptr.clear();
    weights.clear();
    info.root_index.clear();
    base_margin.clear();
    info.num_row = info.num_col = 0;
  }
  /*! \brief get weight of each instance */
  inline float GetWeight(size_t i) const {
    if (weights.size() != 0) {
      return weights[i];
    } else {
      return 1.0f;
    }
  }
  inline void SaveBinary(utils::IStream &fo) const {  // NOLINT(*)
    int version = kVersion;
    fo.Write(&version, sizeof(version));
    fo.Write(&info.num_row, sizeof(info.num_row));
    fo.Write(&info.num_col, sizeof(info.num_col));
    fo.Write(labels);
    fo.Write(group_ptr);
    fo.Write(weights);
    fo.Write(info.root_index);
    fo.Write(base_margin);
  }
  inline void LoadBinary(utils::IStream &fi) {  // NOLINT(*)
    int version;
    utils::Check(fi.Read(&version, sizeof(version)) != 0, "MetaInfo: invalid format");
    utils::Check(fi.Read(&info.num_row, sizeof(info.num_row)) != 0, "MetaInfo: invalid format");
    utils::Check(fi.Read(&info.num_col, sizeof(info.num_col)) != 0, "MetaInfo: invalid format");
    utils::Check(fi.Read(&labels), "MetaInfo: invalid format");
    utils::Check(fi.Read(&group_ptr), "MetaInfo: invalid format");
    utils::Check(fi.Read(&weights), "MetaInfo: invalid format");
    utils::Check(fi.Read(&info.root_index), "MetaInfo: invalid format");
    utils::Check(fi.Read(&base_margin), "MetaInfo: invalid format");
  }
  // try to load group information from file, if it exists
  inline bool TryLoadGroup(const char* fname, bool silent = false) {
    using namespace std;
    FILE *fi = fopen64(fname, "r");
    if (fi == NULL) return false;
    group_ptr.push_back(0);
    unsigned nline;
    while (fscanf(fi, "%u", &nline) == 1) {
      group_ptr.push_back(group_ptr.back() + nline);
    }
    if (!silent) {
      utils::Printf("%u groups are loaded from %s\n",
                    static_cast<unsigned>(group_ptr.size() - 1), fname);
    }
    fclose(fi);
    return true;
  }
  inline std::vector<float>& GetFloatInfo(const char *field) {
    using namespace std;
    if (!strcmp(field, "label")) return labels;
    if (!strcmp(field, "weight")) return weights;
    if (!strcmp(field, "base_margin")) return base_margin;
    utils::Error("unknown field %s", field);
    return labels;
  }
  inline const std::vector<float>& GetFloatInfo(const char *field) const {
    return ((MetaInfo*)this)->GetFloatInfo(field);  // NOLINT(*)
  }
  inline std::vector<unsigned> &GetUIntInfo(const char *field) {
    using namespace std;
    if (!strcmp(field, "root_index")) return info.root_index;
    if (!strcmp(field, "fold_index")) return info.fold_index;
    utils::Error("unknown field %s", field);
    return info.root_index;
  }
  inline const std::vector<unsigned> &GetUIntInfo(const char *field) const {
    return ((MetaInfo*)this)->GetUIntInfo(field);  // NOLINT(*)
  }
  // try to load weight information from file, if it exists
  inline bool TryLoadFloatInfo(const char *field, const char* fname, bool silent = false) {
    using namespace std;
    std::vector<float> &data = this->GetFloatInfo(field);
    FILE *fi = fopen64(fname, "r");
    if (fi == NULL) return false;
    float wt;
    while (fscanf(fi, "%f", &wt) == 1) {
      data.push_back(wt);
    }
    if (!silent) {
      utils::Printf("loading %s from %s\n", field, fname);
    }
    fclose(fi);
    return true;
  }
};

/*!
 * \brief data object used for learning
 * \tparam FMatrix type of the feature data source
 */
struct DMatrix {
  /*!
   * \brief magic number associated with this object,
   *   used to check if it is a specific instance
   */
  const int magic;
  /*! \brief meta information about the dataset */
  MetaInfo info;
  /*!
   * \brief cache pointer to verify if the data structure is cached in some learner;
   *   used to verify if DMatrix is cached
   */
  void *cache_learner_ptr_;
  /*! \brief default constructor */
  explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {}
  /*! \brief get feature matrix about data content */
  virtual IFMatrix *fmat(void) const = 0;
  // virtual destructor
  virtual ~DMatrix(void) {}
};

}  // namespace learner
}  // namespace xgboost
#endif  // XGBOOST_LEARNER_DMATRIX_H_
@@ -20,8 +20,8 @@ def find_lib_path():
    """
    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    # make pythonpack hack: copy this directory one level upper for setup.py
    dll_path = [curr_path, os.path.join(curr_path, '../../wrapper/'),
                os.path.join(curr_path, './wrapper/')]
    dll_path = [curr_path, os.path.join(curr_path, '../../lib/'),
                os.path.join(curr_path, './lib/')]
    if os.name == 'nt':
        if platform.architecture()[0] == '64bit':
            dll_path.append(os.path.join(curr_path, '../../windows/x64/Release/'))
@@ -32,9 +32,9 @@ def find_lib_path():
    # hack for pip installation when copy all parent source directory here
    dll_path.append(os.path.join(curr_path, './windows/Release/'))
    if os.name == 'nt':
        dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path]
        dll_path = [os.path.join(p, 'libxgboost.dll') for p in dll_path]
    else:
        dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path]
        dll_path = [os.path.join(p, 'libxgboost.so') for p in dll_path]
    lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
    # From GitHub issues, most installation errors come from machines w/o compilers
    if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False):
2 rabit
@@ -1 +1 @@
Subproject commit bed63208af736c4aa289b629fbe5396bd9f513d9
Subproject commit 05b958c178b16d707ff16b4b05506be124087e13
528 src/c_api/c_api.cc Normal file
@@ -0,0 +1,528 @@
// Copyright (c) 2014 by Contributors

#include <xgboost/data.h>
#include <xgboost/learner.h>
#include <xgboost/c_api.h>
#include <cstdio>
#include <vector>
#include <string>
#include <cstring>
#include <memory>

#include "./c_api_error.h"
#include "../data/simple_csr_source.h"
#include "../common/thread_local.h"
#include "../common/math.h"
#include "../common/io.h"
#include "../common/group_data.h"

namespace xgboost {

// booster wrapper for backward compatibility reasons.
class Booster {
 public:
  explicit Booster(const std::vector<DMatrix*>& cache_mats)
      : configured_(false),
        initialized_(false),
        learner_(Learner::Create(cache_mats)) {}

  inline Learner* learner() {
    return learner_.get();
  }

  inline void SetParam(const std::string& name, const std::string& val) {
    cfg_.push_back(std::make_pair(name, val));
    if (configured_) {
      learner_->Configure(cfg_);
    }
  }

  inline void LazyInit() {
    if (!configured_) {
      learner_->Configure(cfg_);
      configured_ = true;
    }
    if (!initialized_) {
      learner_->InitModel();
      initialized_ = true;
    }
  }

  inline void LoadModel(dmlc::Stream* fi) {
    learner_->Load(fi);
    initialized_ = true;
  }

 public:
  bool configured_;
  bool initialized_;
  std::unique_ptr<Learner> learner_;
  std::vector<std::pair<std::string, std::string> > cfg_;
};
}  // namespace xgboost

using namespace xgboost;  // NOLINT(*);

/*! \brief entry to easily hold returning information */
struct XGBAPIThreadLocalEntry {
  /*! \brief result holder for returning string */
  std::string ret_str;
  /*! \brief result holder for returning strings */
  std::vector<std::string> ret_vec_str;
  /*! \brief result holder for returning string pointers */
  std::vector<const char *> ret_vec_charp;
  /*! \brief returning float vector. */
  std::vector<float> ret_vec_float;
  /*! \brief temp variable of gradient pairs. */
  std::vector<bst_gpair> tmp_gpair;
};

// define the threadlocal store.
typedef xgboost::common::ThreadLocalStore<XGBAPIThreadLocalEntry> XGBAPIThreadLocalStore;

int XGDMatrixCreateFromFile(const char *fname,
                            int silent,
                            DMatrixHandle *out) {
  API_BEGIN();
  *out = DMatrix::Load(
      fname, silent != 0, false);
  API_END();
}

int XGDMatrixCreateFromCSR(const bst_ulong* indptr,
                           const unsigned *indices,
                           const float* data,
                           bst_ulong nindptr,
                           bst_ulong nelem,
                           DMatrixHandle* out) {
  std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());

  API_BEGIN();
  data::SimpleCSRSource& mat = *source;
  mat.row_ptr_.resize(nindptr);
  for (bst_ulong i = 0; i < nindptr; ++i) {
    mat.row_ptr_[i] = static_cast<size_t>(indptr[i]);
  }
  mat.row_data_.resize(nelem);
  for (bst_ulong i = 0; i < nelem; ++i) {
    mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
    mat.info.num_col = std::max(mat.info.num_col,
                                static_cast<size_t>(indices[i] + 1));
  }
  mat.info.num_row = nindptr - 1;
  mat.info.num_nonzero = static_cast<uint64_t>(nelem);
  *out = DMatrix::Create(std::move(source));
  API_END();
}

int XGDMatrixCreateFromCSC(const bst_ulong* col_ptr,
                           const unsigned* indices,
                           const float* data,
                           bst_ulong nindptr,
                           bst_ulong nelem,
                           DMatrixHandle* out) {
  std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());

  API_BEGIN();
  int nthread;
#pragma omp parallel
  {
    nthread = omp_get_num_threads();
  }
  data::SimpleCSRSource& mat = *source;
  common::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_);
  builder.InitBudget(0, nthread);
  long ncol = static_cast<long>(nindptr - 1);  // NOLINT(*)
#pragma omp parallel for schedule(static)
  for (long i = 0; i < ncol; ++i) {  // NOLINT(*)
    int tid = omp_get_thread_num();
    for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
      builder.AddBudget(indices[j], tid);
    }
  }
  builder.InitStorage();
#pragma omp parallel for schedule(static)
  for (long i = 0; i < ncol; ++i) {  // NOLINT(*)
    int tid = omp_get_thread_num();
    for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
      builder.Push(indices[j],
                   RowBatch::Entry(static_cast<bst_uint>(i), data[j]),
                   tid);
    }
  }
  mat.info.num_row = mat.row_ptr_.size() - 1;
  mat.info.num_col = static_cast<uint64_t>(ncol);
  mat.info.num_nonzero = nelem;
  *out = DMatrix::Create(std::move(source));
  API_END();
}

int XGDMatrixCreateFromMat(const float* data,
                           bst_ulong nrow,
                           bst_ulong ncol,
                           float missing,
                           DMatrixHandle* out) {
  std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());

  API_BEGIN();
  data::SimpleCSRSource& mat = *source;
  bool nan_missing = common::CheckNAN(missing);
  mat.info.num_row = nrow;
  mat.info.num_col = ncol;
  for (bst_ulong i = 0; i < nrow; ++i, data += ncol) {
    bst_ulong nelem = 0;
    for (bst_ulong j = 0; j < ncol; ++j) {
      if (common::CheckNAN(data[j])) {
        CHECK(nan_missing)
            << "There are NAN in the matrix, however, you did not set missing=NAN";
      } else {
        if (nan_missing || data[j] != missing) {
          mat.row_data_.push_back(RowBatch::Entry(j, data[j]));
          ++nelem;
        }
      }
    }
    mat.row_ptr_.push_back(mat.row_ptr_.back() + nelem);
  }
  mat.info.num_nonzero = mat.row_data_.size();
  *out = DMatrix::Create(std::move(source));
  API_END();
}
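A usage sketch for the dense entry point above, using only functions defined in this file (error handling via XGBGetLastError omitted; the data values are illustrative):

    #include <xgboost/c_api.h>

    int main() {
      const float data[] = {1.0f, 2.0f, 3.0f, 4.0f};  // 2x2, row-major
      DMatrixHandle dmat;
      // missing = -1.0f: no entry equals it, so all four values are kept
      if (XGDMatrixCreateFromMat(data, 2, 2, -1.0f, &dmat) != 0) return 1;
      bst_ulong nrow = 0;
      XGDMatrixNumRow(dmat, &nrow);  // nrow == 2
      XGDMatrixFree(dmat);
      return 0;
    }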
int XGDMatrixSliceDMatrix(DMatrixHandle handle,
                          const int* idxset,
                          bst_ulong len,
                          DMatrixHandle* out) {
  std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());

  API_BEGIN();
  data::SimpleCSRSource src;
  src.CopyFrom(static_cast<DMatrix*>(handle));
  data::SimpleCSRSource& ret = *source;

  CHECK_EQ(src.info.group_ptr.size(), 0)
      << "slice does not support group structure";

  ret.Clear();
  ret.info.num_row = len;
  ret.info.num_col = src.info.num_col;

  dmlc::DataIter<RowBatch>* iter = &src;
  iter->BeforeFirst();
  CHECK(iter->Next());

  const RowBatch& batch = iter->Value();
  for (bst_ulong i = 0; i < len; ++i) {
    const int ridx = idxset[i];
    RowBatch::Inst inst = batch[ridx];
    CHECK_LT(static_cast<bst_ulong>(ridx), batch.size);
    ret.row_data_.resize(ret.row_data_.size() + inst.length);
    std::memcpy(dmlc::BeginPtr(ret.row_data_) + ret.row_ptr_.back(), inst.data,
                sizeof(RowBatch::Entry) * inst.length);
    ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length);
    ret.info.num_nonzero += inst.length;

    if (src.info.labels.size() != 0) {
      ret.info.labels.push_back(src.info.labels[ridx]);
    }
    if (src.info.weights.size() != 0) {
      ret.info.weights.push_back(src.info.weights[ridx]);
    }
    if (src.info.root_index.size() != 0) {
      ret.info.root_index.push_back(src.info.root_index[ridx]);
    }
  }
  *out = DMatrix::Create(std::move(source));
  API_END();
}

int XGDMatrixFree(DMatrixHandle handle) {
  API_BEGIN();
  delete static_cast<DMatrix*>(handle);
  API_END();
}

int XGDMatrixSaveBinary(DMatrixHandle handle,
                        const char* fname,
                        int silent) {
  API_BEGIN();
  static_cast<DMatrix*>(handle)->SaveToLocalFile(fname);
  API_END();
}

int XGDMatrixSetFloatInfo(DMatrixHandle handle,
                          const char* field,
                          const float* info,
                          bst_ulong len) {
  API_BEGIN();
  static_cast<DMatrix*>(handle)->info().SetInfo(field, info, kFloat32, len);
  API_END();
}

int XGDMatrixSetUIntInfo(DMatrixHandle handle,
                         const char* field,
                         const unsigned* info,
                         bst_ulong len) {
  API_BEGIN();
  static_cast<DMatrix*>(handle)->info().SetInfo(field, info, kUInt32, len);
  API_END();
}

int XGDMatrixSetGroup(DMatrixHandle handle,
                      const unsigned* group,
                      bst_ulong len) {
  API_BEGIN();
  DMatrix *pmat = static_cast<DMatrix*>(handle);
  MetaInfo& info = pmat->info();
  info.group_ptr.resize(len + 1);
  info.group_ptr[0] = 0;
  for (uint64_t i = 0; i < len; ++i) {
    info.group_ptr[i + 1] = info.group_ptr[i] + group[i];
  }
  API_END();
}
int XGDMatrixGetFloatInfo(const DMatrixHandle handle,
|
||||
const char* field,
|
||||
bst_ulong* out_len,
|
||||
const float** out_dptr) {
|
||||
API_BEGIN();
|
||||
const MetaInfo& info = static_cast<const DMatrix*>(handle)->info();
|
||||
const std::vector<float>* vec = nullptr;
|
||||
if (!std::strcmp(field, "label")) {
|
||||
vec = &info.labels;
|
||||
} else if (!std::strcmp(field, "weight")) {
|
||||
vec = &info.weights;
|
||||
} else if (!std::strcmp(field, "base_margin")) {
|
||||
vec = &info.base_margin;
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown float field name " << field;
|
||||
}
|
||||
*out_len = static_cast<bst_ulong>(vec->size());
|
||||
*out_dptr = dmlc::BeginPtr(*vec);
|
||||
API_END();
|
||||
}
|
||||
|
||||
int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
|
||||
const char *field,
|
||||
bst_ulong *out_len,
|
||||
const unsigned **out_dptr) {
|
||||
API_BEGIN();
|
||||
const MetaInfo& info = static_cast<const DMatrix*>(handle)->info();
|
||||
const std::vector<unsigned>* vec = nullptr;
|
||||
if (!std::strcmp(field, "root_index")) {
|
||||
vec = &info.root_index;
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown uint field name " << field;
|
||||
}
|
||||
*out_len = static_cast<bst_ulong>(vec->size());
|
||||
*out_dptr = dmlc::BeginPtr(*vec);
|
||||
API_END();
|
||||
}
|
||||
|
||||
int XGDMatrixNumRow(const DMatrixHandle handle,
|
||||
bst_ulong *out) {
|
||||
API_BEGIN();
|
||||
*out = static_cast<bst_ulong>(static_cast<const DMatrix*>(handle)->info().num_row);
|
||||
API_END();
|
||||
}
|
||||
|
||||
int XGDMatrixNumCol(const DMatrixHandle handle,
|
||||
bst_ulong *out) {
|
||||
API_BEGIN();
|
||||
*out = static_cast<size_t>(static_cast<const DMatrix*>(handle)->info().num_col);
|
||||
API_END();
|
||||
}
|
||||
|
||||
// xgboost implementation
|
||||
int XGBoosterCreate(DMatrixHandle dmats[],
|
||||
bst_ulong len,
|
||||
BoosterHandle *out) {
|
||||
API_BEGIN();
|
||||
std::vector<DMatrix*> mats;
|
||||
for (bst_ulong i = 0; i < len; ++i) {
|
||||
mats.push_back(static_cast<DMatrix*>(dmats[i]));
|
||||
}
|
||||
*out = new Booster(mats);
|
||||
API_END();
|
||||
}
|
||||
|
||||
int XGBoosterFree(BoosterHandle handle) {
|
||||
API_BEGIN();
|
||||
delete static_cast<Booster*>(handle);
|
||||
API_END();
|
||||
}
|
||||
|
||||
int XGBoosterSetParam(BoosterHandle handle,
|
||||
const char *name,
|
||||
const char *value) {
|
||||
API_BEGIN();
|
||||
static_cast<Booster*>(handle)->SetParam(name, value);
|
||||
API_END();
|
||||
}
|
||||
|
||||
int XGBoosterUpdateOneIter(BoosterHandle handle,
|
||||
int iter,
|
||||
DMatrixHandle dtrain) {
|
||||
API_BEGIN();
|
||||
Booster* bst = static_cast<Booster*>(handle);
|
||||
DMatrix *dtr = static_cast<DMatrix*>(dtrain);
|
||||
|
||||
bst->LazyInit();
|
||||
bst->learner()->UpdateOneIter(iter, dtr);
|
||||
API_END();
|
||||
}
|
||||
|
||||
int XGBoosterBoostOneIter(BoosterHandle handle,
|
||||
DMatrixHandle dtrain,
|
||||
float *grad,
|
||||
float *hess,
|
||||
bst_ulong len) {
|
||||
std::vector<bst_gpair>& tmp_gpair = XGBAPIThreadLocalStore::Get()->tmp_gpair;
|
||||
API_BEGIN();
|
||||
Booster* bst = static_cast<Booster*>(handle);
|
||||
DMatrix* dtr = static_cast<DMatrix*>(dtrain);
|
||||
tmp_gpair.resize(len);
|
||||
for (bst_ulong i = 0; i < len; ++i) {
|
||||
tmp_gpair[i] = bst_gpair(grad[i], hess[i]);
|
||||
}
|
||||
|
||||
bst->LazyInit();
|
||||
bst->learner()->BoostOneIter(0, dtr, &tmp_gpair);
|
||||
API_END();
|
||||
}
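
BoostOneIter is the hook that lets bindings implement custom objectives: the caller supplies per-instance gradient and hessian of its own loss. A sketch under the assumption of a squared-error loss (preds, labels, and the handles are illustrative):

// grad/hess of 0.5 * (pred - y)^2 with respect to pred
for (bst_ulong i = 0; i < n; ++i) {
  grad[i] = preds[i] - labels[i];  // first derivative
  hess[i] = 1.0f;                  // second derivative
}
XGBoosterBoostOneIter(booster, dtrain, grad, hess, n);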

int XGBoosterEvalOneIter(BoosterHandle handle,
                         int iter,
                         DMatrixHandle dmats[],
                         const char* evnames[],
                         bst_ulong len,
                         const char** out_str) {
  std::string& eval_str = XGBAPIThreadLocalStore::Get()->ret_str;
  API_BEGIN();
  Booster* bst = static_cast<Booster*>(handle);
  std::vector<DMatrix*> data_sets;
  std::vector<std::string> data_names;

  for (bst_ulong i = 0; i < len; ++i) {
    data_sets.push_back(static_cast<DMatrix*>(dmats[i]));
    data_names.push_back(std::string(evnames[i]));
  }

  bst->LazyInit();
  eval_str = bst->learner()->EvalOneIter(iter, data_sets, data_names);
  *out_str = eval_str.c_str();
  API_END();
}

int XGBoosterPredict(BoosterHandle handle,
                     DMatrixHandle dmat,
                     int option_mask,
                     unsigned ntree_limit,
                     bst_ulong *len,
                     const float **out_result) {
  std::vector<float>& preds = XGBAPIThreadLocalStore::Get()->ret_vec_float;
  API_BEGIN();
  Booster *bst = static_cast<Booster*>(handle);
  bst->LazyInit();
  bst->learner()->Predict(
      static_cast<DMatrix*>(dmat),
      (option_mask & 1) != 0,
      &preds, ntree_limit,
      (option_mask & 2) != 0);
  *out_result = dmlc::BeginPtr(preds);
  *len = static_cast<bst_ulong>(preds.size());
  API_END();
}
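
Note how option_mask is decoded above: bit 0 selects the first boolean mode handed to Learner::Predict (margin output) and bit 1 the second. A margin-prediction call would look like this sketch (handles illustrative):

bst_ulong out_len = 0;
const float* out_result = nullptr;
// option_mask = 1: margin output; ntree_limit = 0: use all trees
XGBoosterPredict(booster, dtest, 1, 0, &out_len, &out_result);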

int XGBoosterLoadModel(BoosterHandle handle, const char* fname) {
  API_BEGIN();
  std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname, "r"));
  static_cast<Booster*>(handle)->LoadModel(fi.get());
  API_END();
}

int XGBoosterSaveModel(BoosterHandle handle, const char* fname) {
  API_BEGIN();
  std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname, "w"));
  Booster *bst = static_cast<Booster*>(handle);
  bst->LazyInit();
  bst->learner()->Save(fo.get());
  API_END();
}

int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
                                 const void* buf,
                                 bst_ulong len) {
  API_BEGIN();
  common::MemoryFixSizeBuffer fs((void*)buf, len);  // NOLINT(*)
  static_cast<Booster*>(handle)->LoadModel(&fs);
  API_END();
}

int XGBoosterGetModelRaw(BoosterHandle handle,
                         bst_ulong* out_len,
                         const char** out_dptr) {
  std::string& raw_str = XGBAPIThreadLocalStore::Get()->ret_str;
  raw_str.resize(0);

  API_BEGIN();
  common::MemoryBufferStream fo(&raw_str);
  Booster *bst = static_cast<Booster*>(handle);
  bst->LazyInit();
  bst->learner()->Save(&fo);
  *out_dptr = dmlc::BeginPtr(raw_str);
  *out_len = static_cast<bst_ulong>(raw_str.length());
  API_END();
}

inline void XGBoostDumpModelImpl(
    BoosterHandle handle,
    const FeatureMap& fmap,
    int with_stats,
    bst_ulong* len,
    const char*** out_models) {
  std::vector<std::string>& str_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_str;
  std::vector<const char*>& charp_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_charp;
  Booster *bst = static_cast<Booster*>(handle);
  bst->LazyInit();
  str_vecs = bst->learner()->Dump2Text(fmap, with_stats != 0);
  charp_vecs.resize(str_vecs.size());
  for (size_t i = 0; i < str_vecs.size(); ++i) {
    charp_vecs[i] = str_vecs[i].c_str();
  }
  *out_models = dmlc::BeginPtr(charp_vecs);
  *len = static_cast<bst_ulong>(charp_vecs.size());
}

int XGBoosterDumpModel(BoosterHandle handle,
                       const char* fmap,
                       int with_stats,
                       bst_ulong* len,
                       const char*** out_models) {
  API_BEGIN();
  FeatureMap featmap;
  if (strlen(fmap) != 0) {
    std::unique_ptr<dmlc::Stream> fs(
        dmlc::Stream::Create(fmap, "r"));
    dmlc::istream is(fs.get());
    featmap.LoadText(is);
  }
  XGBoostDumpModelImpl(handle, featmap, with_stats, len, out_models);
  API_END();
}

int XGBoosterDumpModelWithFeatures(BoosterHandle handle,
                                   int fnum,
                                   const char** fname,
                                   const char** ftype,
                                   int with_stats,
                                   bst_ulong* len,
                                   const char*** out_models) {
  API_BEGIN();
  FeatureMap featmap;
  for (int i = 0; i < fnum; ++i) {
    featmap.PushBack(i, fname[i], ftype[i]);
  }
  XGBoostDumpModelImpl(handle, featmap, with_stats, len, out_models);
  API_END();
}

21
src/c_api/c_api_error.cc
Normal file
@ -0,0 +1,21 @@
/*!
 * Copyright (c) 2015 by Contributors
 * \file c_api_error.cc
 * \brief C error handling
 */
#include "./c_api_error.h"
#include "../common/thread_local.h"

struct XGBAPIErrorEntry {
  std::string last_error;
};

typedef xgboost::common::ThreadLocalStore<XGBAPIErrorEntry> XGBAPIErrorStore;

const char *XGBGetLastError() {
  return XGBAPIErrorStore::Get()->last_error.c_str();
}

void XGBAPISetLastError(const char* msg) {
  XGBAPIErrorStore::Get()->last_error = msg;
}
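
Every C API function above returns 0 on success and -1 after an exception, with the message parked in this thread-local slot. A typical caller-side guard, as a sketch (the safe_xgboost macro name is illustrative):

#include <cstdio>
#include <cstdlib>

#define safe_xgboost(call) do {                                     \
  if ((call) != 0) {                                                \
    std::fprintf(stderr, "xgboost error: %s\n", XGBGetLastError()); \
    std::exit(1);                                                   \
  }                                                                 \
} while (0)

// usage: safe_xgboost(XGBoosterSetParam(booster, "eta", "0.1"));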

39
src/c_api/c_api_error.h
Normal file
@ -0,0 +1,39 @@
/*!
 * Copyright (c) 2015 by Contributors
 * \file c_api_error.h
 * \brief Error handling for C API.
 */
#ifndef XGBOOST_C_API_C_API_ERROR_H_
#define XGBOOST_C_API_C_API_ERROR_H_

#include <dmlc/base.h>
#include <dmlc/logging.h>
#include <xgboost/c_api.h>

/*! \brief macro to guard beginning and end section of all functions */
#define API_BEGIN() try {
/*! \brief every function starts with API_BEGIN();
    and finishes with API_END() or API_END_HANDLE_ERROR */
#define API_END() } catch(dmlc::Error &_except_) { return XGBAPIHandleException(_except_); } return 0; // NOLINT(*)
/*!
 * \brief every function starts with API_BEGIN();
 *   and finishes with API_END() or API_END_HANDLE_ERROR.
 *   The finally clause contains the procedure to clean up state when an error happens.
 */
#define API_END_HANDLE_ERROR(Finalize) } catch(dmlc::Error &_except_) { Finalize; return XGBAPIHandleException(_except_); } return 0; // NOLINT(*)

/*!
 * \brief Set the last error message needed by C API
 * \param msg The error message to set.
 */
void XGBAPISetLastError(const char* msg);
/*!
 * \brief handle the exception that was thrown
 * \param e the exception
 * \return the return value of the API after the exception is handled
 */
inline int XGBAPIHandleException(const dmlc::Error &e) {
  XGBAPISetLastError(e.what());
  return -1;
}
#endif  // XGBOOST_C_API_C_API_ERROR_H_

@ -11,8 +11,9 @@

#include <xgboost/learner.h>
#include <xgboost/data.h>
#include <dmlc/logging.h>
#include <xgboost/logging.h>
#include <dmlc/timer.h>
#include <iomanip>
#include <ctime>
#include <string>
#include <cstdio>
@ -107,6 +108,8 @@ struct CLIParam : public dmlc::Parameter<CLIParam> {
        .describe("Data split mode.");
    DMLC_DECLARE_FIELD(ntree_limit).set_default(0).set_lower_bound(0)
        .describe("Number of trees used for prediction, 0 means use all trees.");
    DMLC_DECLARE_FIELD(pred_margin).set_default(false)
        .describe("Whether to predict margin value instead of probability.");
    DMLC_DECLARE_FIELD(dump_stats).set_default(false)
        .describe("Whether to dump the model statistics.");
    DMLC_DECLARE_FIELD(name_fmap).set_default("NULL")
@ -115,7 +118,8 @@ struct CLIParam : public dmlc::Parameter<CLIParam> {
        .describe("Name of the output dump text file.");
    // alias
    DMLC_DECLARE_ALIAS(train_path, data);
    DMLC_DECLARE_ALIAS(test_path, "test:data");
    DMLC_DECLARE_ALIAS(test_path, test:data);
    DMLC_DECLARE_ALIAS(name_fmap, fmap);
  }
  // customized configure function of CLIParam
  inline void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) {
@ -149,7 +153,7 @@ DMLC_REGISTER_PARAMETER(CLIParam);
void CLITrain(const CLIParam& param) {
  if (rabit::IsDistributed()) {
    std::string pname = rabit::GetProcessorName();
    LOG(INFO) << "start " << pname << ":" << rabit::GetRank();
    LOG(CONSOLE) << "start " << pname << ":" << rabit::GetRank();
  }
  // load in data.
  std::unique_ptr<DMatrix> dtrain(
@ -178,6 +182,8 @@ void CLITrain(const CLIParam& param) {
    std::unique_ptr<dmlc::Stream> fi(
        dmlc::Stream::Create(param.model_in.c_str(), "r"));
    learner->Load(fi.get());
  } else {
    learner->InitModel();
  }
}
// start training.
@ -186,7 +192,7 @@ void CLITrain(const CLIParam& param) {
  double elapsed = dmlc::GetTime() - start;
  if (version % 2 == 0) {
    if (param.silent == 0) {
      LOG(INFO) << "boosting round " << i << ", " << elapsed << " sec elapsed";
      LOG(CONSOLE) << "boosting round " << i << ", " << elapsed << " sec elapsed";
    }
    learner->UpdateOneIter(i, dtrain.get());
    if (learner->AllowLazyCheckPoint()) {
@ -200,16 +206,18 @@ void CLITrain(const CLIParam& param) {
    std::string res = learner->EvalOneIter(i, eval_datasets, eval_data_names);
    if (rabit::IsDistributed()) {
      if (rabit::GetRank() == 0) {
        rabit::TrackerPrint(res + "\n");
        LOG(TRACKER) << res;
      }
    } else {
      if (param.silent < 2) {
        LOG(INFO) << res;
        LOG(CONSOLE) << res;
      }
    }
    if (param.save_period != 0 && (i + 1) % param.save_period == 0) {
      std::ostringstream os;
      os << param.model_dir << '/' << i + 1 << ".model";
      os << param.model_dir << '/'
         << std::setfill('0') << std::setw(4)
         << i + 1 << ".model";
      std::unique_ptr<dmlc::Stream> fo(
          dmlc::Stream::Create(os.str().c_str(), "w"));
      learner->Save(fo.get());
@ -228,7 +236,9 @@ void CLITrain(const CLIParam& param) {
      param.model_out != "NONE") {
    std::ostringstream os;
    if (param.model_out == "NULL") {
      os << param.model_dir << '/' << param.num_round << ".model";
      os << param.model_dir << '/'
         << std::setfill('0') << std::setw(4)
         << param.num_round << ".model";
    } else {
      os << param.model_out;
    }
@ -239,7 +249,7 @@ void CLITrain(const CLIParam& param) {

  if (param.silent == 0) {
    double elapsed = dmlc::GetTime() - start;
    LOG(INFO) << "update end, " << elapsed << " sec in all";
    LOG(CONSOLE) << "update end, " << elapsed << " sec in all";
  }
}

@ -272,6 +282,8 @@ void CLIDump2Text(const CLIParam& param) {
}

void CLIPredict(const CLIParam& param) {
  CHECK_NE(param.test_path, "NULL")
      << "Test dataset parameter test:data must be specified.";
  // load data
  std::unique_ptr<DMatrix> dtest(
      DMatrix::Load(param.test_path, param.silent != 0, param.dsplit == 2));
@ -284,12 +296,12 @@
  learner->Load(fi.get());

  if (param.silent == 0) {
    LOG(INFO) << "start prediction...";
    LOG(CONSOLE) << "start prediction...";
  }
  std::vector<float> preds;
  learner->Predict(dtest.get(), param.pred_margin, &preds, param.ntree_limit);
  if (param.silent == 0) {
    LOG(INFO) << "writing prediction to " << param.name_pred;
    LOG(CONSOLE) << "writing prediction to " << param.name_pred;
  }
  std::unique_ptr<dmlc::Stream> fo(
      dmlc::Stream::Create(param.name_pred.c_str(), "w"));

@ -8,7 +8,7 @@
#ifndef XGBOOST_COMMON_BASE64_H_
#define XGBOOST_COMMON_BASE64_H_

#include <dmlc/logging.h>
#include <xgboost/logging.h>
#include <cctype>
#include <cstdio>
#include <string>

15
src/common/common.cc
Normal file
@ -0,0 +1,15 @@
/*!
 * Copyright 2015 by Contributors
 * \file common.cc
 * \brief Enable all kinds of global variables in common.
 */
#include "./random.h"

namespace xgboost {
namespace common {
RandomEngine& GlobalRandom() {
  static RandomEngine inst;
  return inst;
}
}  // namespace common
}  // namespace xgboost
@ -8,7 +8,7 @@
#define XGBOOST_COMMON_QUANTILE_H_

#include <dmlc/base.h>
#include <dmlc/logging.h>
#include <xgboost/logging.h>
#include <cmath>
#include <vector>
#include <cstring>

77
src/common/thread_local.h
Normal file
@ -0,0 +1,77 @@
/*!
 * Copyright (c) 2015 by Contributors
 * \file thread_local.h
 * \brief Common utility for thread local storage.
 */
#ifndef XGBOOST_COMMON_THREAD_LOCAL_H_
#define XGBOOST_COMMON_THREAD_LOCAL_H_

#include <mutex>
#include <memory>
#include <vector>

namespace xgboost {
namespace common {

// macro handling for thread-local variables
#ifdef __GNUC__
#define MX_TREAD_LOCAL __thread
#elif __STDC_VERSION__ >= 201112L
#define MX_TREAD_LOCAL _Thread_local
#elif defined(_MSC_VER)
#define MX_TREAD_LOCAL __declspec(thread)
#endif

#ifndef MX_TREAD_LOCAL
#pragma message("Warning: Threadlocal is not enabled")
#endif

/*!
 * \brief A threadlocal store to store threadlocal variables.
 *  Will return a thread local singleton of type T
 * \tparam T the type we like to store
 */
template<typename T>
class ThreadLocalStore {
 public:
  /*! \return get a thread local singleton */
  static T* Get() {
    static MX_TREAD_LOCAL T* ptr = nullptr;
    if (ptr == nullptr) {
      ptr = new T();
      Singleton()->RegisterDelete(ptr);
    }
    return ptr;
  }

 private:
  /*! \brief constructor */
  ThreadLocalStore() {}
  /*! \brief destructor */
  ~ThreadLocalStore() {
    for (size_t i = 0; i < data_.size(); ++i) {
      delete data_[i];
    }
  }
  /*! \return singleton of the store */
  static ThreadLocalStore<T> *Singleton() {
    static ThreadLocalStore<T> inst;
    return &inst;
  }
  /*!
   * \brief register str for internal deletion
   * \param str the string pointer
   */
  void RegisterDelete(T *str) {
    std::unique_lock<std::mutex> lock(mutex_);
    data_.push_back(str);
    lock.unlock();
  }
  /*! \brief internal mutex */
  std::mutex mutex_;
  /*! \brief internal data */
  std::vector<T*> data_;
};
}  // namespace common
}  // namespace xgboost
#endif  // XGBOOST_COMMON_THREAD_LOCAL_H_
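
This is the store backing XGBAPIThreadLocalStore in c_api.cc: each thread lazily news its own T on first Get(), and the singleton frees them all at shutdown. A minimal sketch of defining a per-thread scratch area (the ReturnBuffer struct and header path are illustrative):

#include <string>
#include <vector>
#include "../common/thread_local.h"

struct ReturnBuffer {
  std::vector<float> ret_vec_float;
  std::string ret_str;
};
typedef xgboost::common::ThreadLocalStore<ReturnBuffer> ScratchStore;

void UseScratch() {
  // every calling thread sees its own ReturnBuffer instance
  std::vector<float>& v = ScratchStore::Get()->ret_vec_float;
  v.assign(4, 0.0f);
}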

@ -3,7 +3,12 @@
 * \file data.cc
 */
#include <xgboost/data.h>
#include <xgboost/logging.h>
#include <cstring>
#include "./sparse_batch_page.h"
#include "./simple_dmatrix.h"
#include "./simple_csr_source.h"
#include "../common/io.h"

namespace xgboost {
// implementation of inline functions
@ -83,4 +88,83 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t
  }
}

DMatrix* DMatrix::Load(const std::string& uri,
                       bool silent,
                       bool load_row_split,
                       const std::string& file_format) {
  std::string fname, cache_file;
  size_t dlm_pos = uri.find('#');
  if (dlm_pos != std::string::npos) {
    cache_file = uri.substr(dlm_pos + 1, uri.length());
    fname = uri.substr(0, dlm_pos);
    CHECK_EQ(cache_file.find('#'), std::string::npos)
        << "Only one `#` is allowed in file path for cache file specification.";
    if (load_row_split) {
      std::ostringstream os;
      os << cache_file << ".r" << rabit::GetRank();
      cache_file = os.str();
    }
  } else {
    fname = uri;
  }
  int partid = 0, npart = 1;
  if (load_row_split) {
    partid = rabit::GetRank();
    npart = rabit::GetWorldSize();
  }

  // legacy handling of binary data loading
  if (file_format == "auto" && !load_row_split) {
    int magic;
    std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r"));
    common::PeekableInStream is(fi.get());
    if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic) &&
        magic == data::SimpleCSRSource::kMagic) {
      std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
      source->LoadBinary(&is);
      DMatrix* dmat = DMatrix::Create(std::move(source), cache_file);
      if (!silent) {
        LOG(CONSOLE) << dmat->info().num_row << 'x' << dmat->info().num_col << " matrix with "
                     << dmat->info().num_nonzero << " entries loaded from " << uri;
      }
      return dmat;
    }
  }

  std::string ftype = file_format;
  if (file_format == "auto") ftype = "libsvm";
  std::unique_ptr<dmlc::Parser<uint32_t> > parser(
      dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, ftype.c_str()));
  DMatrix* dmat = DMatrix::Create(parser.get(), cache_file);
  if (!silent) {
    LOG(CONSOLE) << dmat->info().num_row << 'x' << dmat->info().num_col << " matrix with "
                 << dmat->info().num_nonzero << " entries loaded from " << uri;
  }
  return dmat;
}
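
The '#' split above defines the URI grammar the loader accepts: everything after the hash names the cache-file prefix. Illustrative forms (paths are made up; note that in this commit a non-empty cache prefix still hits the "external memory not yet implemented" fatal below):

// plain path: format auto-detected (binary magic, else libsvm)
xgboost::DMatrix* d1 =
    xgboost::DMatrix::Load("data/agaricus.txt.train", false, false, "auto");
// "path#prefix" form: "dtrain.cache" becomes cache_file,
// reserved for the external-memory code path
// xgboost::DMatrix* d2 =
//     xgboost::DMatrix::Load("data/agaricus.txt.train#dtrain.cache", false, false, "auto");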

DMatrix* DMatrix::Create(dmlc::Parser<uint32_t>* parser,
                         const std::string& cache_prefix) {
  if (cache_prefix.length() == 0) {
    std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
    source->CopyFrom(parser);
    return DMatrix::Create(std::move(source), cache_prefix);
  } else {
    LOG(FATAL) << "external memory not yet implemented";
    return nullptr;
  }
}

void DMatrix::SaveToLocalFile(const std::string& fname) {
  data::SimpleCSRSource source;
  source.CopyFrom(this);
  std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
  source.SaveBinary(fo.get());
}

DMatrix* DMatrix::Create(std::unique_ptr<DataSource>&& source,
                         const std::string& cache_prefix) {
  return new data::SimpleDMatrix(std::move(source));
}
}  // namespace xgboost

@ -3,7 +3,7 @@
 * \file simple_csr_source.cc
 */
#include <dmlc/base.h>
#include <dmlc/logging.h>
#include <xgboost/logging.h>
#include "./simple_csr_source.h"

namespace xgboost {
@ -80,7 +80,7 @@ void SimpleCSRSource::SaveBinary(dmlc::Stream* fo) const {
}

void SimpleCSRSource::BeforeFirst() {
  at_first_ = false;
  at_first_ = true;
}

bool SimpleCSRSource::Next() {

265
src/data/simple_dmatrix.cc
Normal file
@ -0,0 +1,265 @@
/*!
 * Copyright 2014 by Contributors
 * \file simple_dmatrix.cc
 * \brief the input data structure for gradient boosting
 * \author Tianqi Chen
 */
#include <xgboost/data.h>
#include <limits>
#include <algorithm>
#include <vector>
#include "./simple_dmatrix.h"
#include "../common/random.h"
#include "../common/group_data.h"

namespace xgboost {
namespace data {

bool SimpleDMatrix::ColBatchIter::Next() {
  if (data_ptr_ >= cpages_.size()) return false;
  data_ptr_ += 1;
  SparsePage* pcol = cpages_[data_ptr_ - 1].get();
  batch_.size = col_index_.size();
  col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0));
  for (size_t i = 0; i < col_data_.size(); ++i) {
    const bst_uint ridx = col_index_[i];
    col_data_[i] = SparseBatch::Inst
        (dmlc::BeginPtr(pcol->data) + pcol->offset[ridx],
         static_cast<bst_uint>(pcol->offset[ridx + 1] - pcol->offset[ridx]));
  }
  batch_.col_index = dmlc::BeginPtr(col_index_);
  batch_.col_data = dmlc::BeginPtr(col_data_);
  return true;
}

dmlc::DataIter<ColBatch>* SimpleDMatrix::ColIterator() {
  size_t ncol = this->info().num_col;
  col_iter_.col_index_.resize(ncol);
  for (size_t i = 0; i < ncol; ++i) {
    col_iter_.col_index_[i] = static_cast<bst_uint>(i);
  }
  col_iter_.BeforeFirst();
  return &col_iter_;
}

dmlc::DataIter<ColBatch>* SimpleDMatrix::ColIterator(const std::vector<bst_uint>& fset) {
  size_t ncol = this->info().num_col;
  col_iter_.col_index_.resize(0);
  for (size_t i = 0; i < fset.size(); ++i) {
    if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]);
  }
  col_iter_.BeforeFirst();
  return &col_iter_;
}

void SimpleDMatrix::InitColAccess(const std::vector<bool> &enabled,
                                  float pkeep,
                                  size_t max_row_perbatch) {
  if (this->HaveColAccess()) return;

  col_iter_.cpages_.clear();
  if (info().num_row < max_row_perbatch) {
    std::unique_ptr<SparsePage> page(new SparsePage());
    this->MakeOneBatch(enabled, pkeep, page.get());
    col_iter_.cpages_.push_back(std::move(page));
  } else {
    this->MakeManyBatch(enabled, pkeep, max_row_perbatch);
  }
  // setup col-size
  col_size_.resize(info().num_col);
  std::fill(col_size_.begin(), col_size_.end(), 0);
  for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) {
    SparsePage *pcol = col_iter_.cpages_[i].get();
    for (size_t j = 0; j < pcol->Size(); ++j) {
      col_size_[j] += pcol->offset[j + 1] - pcol->offset[j];
    }
  }
}

// internal function to make one batch from row iter.
void SimpleDMatrix::MakeOneBatch(const std::vector<bool>& enabled,
                                 float pkeep,
                                 SparsePage *pcol) {
  // clear rowset
  buffered_rowset_.clear();
  // bit map
  int nthread;
  std::vector<bool> bmap;
  #pragma omp parallel
  {
    nthread = omp_get_num_threads();
  }

  pcol->Clear();
  common::ParallelGroupBuilder<SparseBatch::Entry>
      builder(&pcol->offset, &pcol->data);
  builder.InitBudget(info().num_col, nthread);
  // start working
  dmlc::DataIter<RowBatch>* iter = this->RowIterator();
  iter->BeforeFirst();
  while (iter->Next()) {
    const RowBatch& batch = iter->Value();
    bmap.resize(bmap.size() + batch.size, true);
    std::bernoulli_distribution coin_flip(pkeep);
    auto& rnd = common::GlobalRandom();

    long batch_size = static_cast<long>(batch.size);  // NOLINT(*)
    for (long i = 0; i < batch_size; ++i) {  // NOLINT(*)
      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
      if (pkeep == 1.0f || coin_flip(rnd)) {
        buffered_rowset_.push_back(ridx);
      } else {
        bmap[i] = false;
      }
    }
    #pragma omp parallel for schedule(static)
    for (long i = 0; i < batch_size; ++i) {  // NOLINT(*)
      int tid = omp_get_thread_num();
      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
      if (bmap[ridx]) {
        RowBatch::Inst inst = batch[i];
        for (bst_uint j = 0; j < inst.length; ++j) {
          if (enabled[inst[j].index]) {
            builder.AddBudget(inst[j].index, tid);
          }
        }
      }
    }
  }
  builder.InitStorage();

  iter->BeforeFirst();
  while (iter->Next()) {
    const RowBatch& batch = iter->Value();
    #pragma omp parallel for schedule(static)
    for (long i = 0; i < static_cast<long>(batch.size); ++i) {  // NOLINT(*)
      int tid = omp_get_thread_num();
      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
      if (bmap[ridx]) {
        RowBatch::Inst inst = batch[i];
        for (bst_uint j = 0; j < inst.length; ++j) {
          if (enabled[inst[j].index]) {
            builder.Push(inst[j].index,
                         SparseBatch::Entry((bst_uint)(batch.base_rowid+i),
                                            inst[j].fvalue), tid);
          }
        }
      }
    }
  }

  CHECK_EQ(pcol->Size(), info().num_col);
  // sort columns
  bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
  #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
  for (bst_omp_uint i = 0; i < ncol; ++i) {
    if (pcol->offset[i] < pcol->offset[i + 1]) {
      std::sort(dmlc::BeginPtr(pcol->data) + pcol->offset[i],
                dmlc::BeginPtr(pcol->data) + pcol->offset[i + 1],
                SparseBatch::Entry::CmpValue);
    }
  }
}

void SimpleDMatrix::MakeManyBatch(const std::vector<bool>& enabled,
                                  float pkeep,
                                  size_t max_row_perbatch) {
  size_t btop = 0;
  std::bernoulli_distribution coin_flip(pkeep);
  auto& rnd = common::GlobalRandom();
  buffered_rowset_.clear();
  // internal temp cache
  SparsePage tmp; tmp.Clear();
  // start working
  dmlc::DataIter<RowBatch>* iter = this->RowIterator();
  iter->BeforeFirst();

  while (iter->Next()) {
    const RowBatch &batch = iter->Value();
    for (size_t i = 0; i < batch.size; ++i) {
      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
      if (pkeep == 1.0f || coin_flip(rnd)) {
        buffered_rowset_.push_back(ridx);
        tmp.Push(batch[i]);
      }
      if (tmp.Size() >= max_row_perbatch) {
        std::unique_ptr<SparsePage> page(new SparsePage());
        this->MakeColPage(tmp.GetRowBatch(0),
                          dmlc::BeginPtr(buffered_rowset_) + btop,
                          enabled, page.get());
        col_iter_.cpages_.push_back(std::move(page));
        btop = buffered_rowset_.size();
        tmp.Clear();
      }
    }
  }

  if (tmp.Size() != 0) {
    std::unique_ptr<SparsePage> page(new SparsePage());
    this->MakeColPage(tmp.GetRowBatch(0),
                      dmlc::BeginPtr(buffered_rowset_) + btop,
                      enabled, page.get());
    col_iter_.cpages_.push_back(std::move(page));
  }
}

// make column page from a subset of row batches
void SimpleDMatrix::MakeColPage(const RowBatch& batch,
                                const bst_uint* ridx,
                                const std::vector<bool>& enabled,
                                SparsePage* pcol) {
  int nthread;
  #pragma omp parallel
  {
    nthread = omp_get_num_threads();
    int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1);
    if (nthread > max_nthread) {
      nthread = max_nthread;
    }
  }
  pcol->Clear();
  common::ParallelGroupBuilder<SparseBatch::Entry>
      builder(&pcol->offset, &pcol->data);
  builder.InitBudget(info().num_col, nthread);
  bst_omp_uint ndata = static_cast<bst_omp_uint>(batch.size);
  #pragma omp parallel for schedule(static) num_threads(nthread)
  for (bst_omp_uint i = 0; i < ndata; ++i) {
    int tid = omp_get_thread_num();
    RowBatch::Inst inst = batch[i];
    for (bst_uint j = 0; j < inst.length; ++j) {
      const SparseBatch::Entry &e = inst[j];
      if (enabled[e.index]) {
        builder.AddBudget(e.index, tid);
      }
    }
  }
  builder.InitStorage();
  #pragma omp parallel for schedule(static) num_threads(nthread)
  for (bst_omp_uint i = 0; i < ndata; ++i) {
    int tid = omp_get_thread_num();
    RowBatch::Inst inst = batch[i];
    for (bst_uint j = 0; j < inst.length; ++j) {
      const SparseBatch::Entry &e = inst[j];
      builder.Push(e.index,
                   SparseBatch::Entry(ridx[i], e.fvalue),
                   tid);
    }
  }
  CHECK_EQ(pcol->Size(), info().num_col);
  // sort columns
  bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
  #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
  for (bst_omp_uint i = 0; i < ncol; ++i) {
    if (pcol->offset[i] < pcol->offset[i + 1]) {
      std::sort(dmlc::BeginPtr(pcol->data) + pcol->offset[i],
                dmlc::BeginPtr(pcol->data) + pcol->offset[i + 1],
                SparseBatch::Entry::CmpValue);
    }
  }
}

bool SimpleDMatrix::SingleColBlock() const {
  return col_iter_.cpages_.size() <= 1;
}
}  // namespace data
}  // namespace xgboost
119
src/data/simple_dmatrix.h
Normal file
@ -0,0 +1,119 @@
/*!
 * Copyright 2015 by Contributors
 * \file simple_dmatrix.h
 * \brief In-memory version of DMatrix.
 * \author Tianqi Chen
 */
#ifndef XGBOOST_DATA_SIMPLE_DMATRIX_H_
#define XGBOOST_DATA_SIMPLE_DMATRIX_H_

#include <xgboost/base.h>
#include <xgboost/data.h>
#include <vector>
#include <algorithm>
#include <cstring>
#include "./sparse_batch_page.h"

namespace xgboost {
namespace data {

class SimpleDMatrix : public DMatrix {
 public:
  explicit SimpleDMatrix(std::unique_ptr<DataSource>&& source)
      : source_(std::move(source)) {}

  MetaInfo& info() override {
    return source_->info;
  }

  const MetaInfo& info() const override {
    return source_->info;
  }

  dmlc::DataIter<RowBatch>* RowIterator() override {
    dmlc::DataIter<RowBatch>* iter = source_.get();
    iter->BeforeFirst();
    return iter;
  }

  bool HaveColAccess() const override {
    return col_size_.size() != 0;
  }

  const std::vector<bst_uint>& buffered_rowset() const override {
    return buffered_rowset_;
  }

  size_t GetColSize(size_t cidx) const {
    return col_size_[cidx];
  }

  float GetColDensity(size_t cidx) const override {
    // density = 1 - (#missing entries / #buffered rows)
    size_t nmiss = buffered_rowset_.size() - col_size_[cidx];
    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
  }

  dmlc::DataIter<ColBatch>* ColIterator() override;

  dmlc::DataIter<ColBatch>* ColIterator(const std::vector<bst_uint>& fset) override;

  void InitColAccess(const std::vector<bool>& enabled,
                     float subsample,
                     size_t max_row_perbatch) override;

  bool SingleColBlock() const override;

 private:
  // in-memory column batch iterator.
  struct ColBatchIter : dmlc::DataIter<ColBatch> {
   public:
    ColBatchIter() : data_ptr_(0) {}
    void BeforeFirst() override {
      data_ptr_ = 0;
    }
    const ColBatch &Value() const override {
      return batch_;
    }
    bool Next() override;

   private:
    // allow SimpleDMatrix to access it.
    friend class SimpleDMatrix;
    // data content
    std::vector<bst_uint> col_index_;
    // column content
    std::vector<ColBatch::Inst> col_data_;
    // column sparse pages
    std::vector<std::unique_ptr<SparsePage> > cpages_;
    // data pointer
    size_t data_ptr_;
    // temporal space for batch
    ColBatch batch_;
  };

  // source data pointer.
  std::unique_ptr<DataSource> source_;
  // column iterator
  ColBatchIter col_iter_;
  // list of row indices that are buffered.
  std::vector<bst_uint> buffered_rowset_;
  /*! \brief size of each column's data */
  std::vector<size_t> col_size_;

  // internal function to make one batch from row iter.
  void MakeOneBatch(const std::vector<bool>& enabled,
                    float pkeep,
                    SparsePage *pcol);

  void MakeManyBatch(const std::vector<bool>& enabled,
                     float pkeep,
                     size_t max_row_perbatch);

  void MakeColPage(const RowBatch& batch,
                   const bst_uint* ridx,
                   const std::vector<bool>& enabled,
                   SparsePage* pcol);
};
}  // namespace data
}  // namespace xgboost
#endif  // XGBOOST_DATA_SIMPLE_DMATRIX_H_
@ -6,17 +6,18 @@
 * use in external memory computation
 * \author Tianqi Chen
 */
#ifndef XGBOOST_IO_SPARSE_BATCH_PAGE_H_
#define XGBOOST_IO_SPARSE_BATCH_PAGE_H_
#ifndef XGBOOST_DATA_SPARSE_BATCH_PAGE_H_
#define XGBOOST_DATA_SPARSE_BATCH_PAGE_H_

#include <xgboost/data.h>
#include <dmlc/io.h>
#include <vector>
#include <algorithm>
#include "../data.h"

namespace xgboost {
namespace io {
namespace data {
/*!
 * \brief storage unit of sparse batch
 * \brief in-memory storage unit of sparse batch
 */
class SparsePage {
 public:
@ -24,6 +25,7 @@ class SparsePage {
  std::vector<size_t> offset;
  /*! \brief the data of the segments */
  std::vector<SparseBatch::Entry> data;

  /*! \brief constructor */
  SparsePage() {
    this->Clear();
@ -38,14 +40,14 @@ class SparsePage {
   * \param sorted_index_set sorted index of segments we are interested in
   * \return true if the loading was successful, false if end of file was reached
   */
  inline bool Load(utils::ISeekStream *fi,
  inline bool Load(dmlc::SeekStream *fi,
                   const std::vector<bst_uint> &sorted_index_set) {
    if (!fi->Read(&disk_offset_)) return false;
    // setup the offset
    offset.clear(); offset.push_back(0);
    for (size_t i = 0; i < sorted_index_set.size(); ++i) {
      bst_uint fid = sorted_index_set[i];
      utils::Check(fid + 1 < disk_offset_.size(), "bad col.blob format");
      CHECK_LT(fid + 1, disk_offset_.size());
      size_t size = disk_offset_[fid + 1] - disk_offset_[fid];
      offset.push_back(offset.back() + size);
    }
@ -56,7 +58,7 @@ class SparsePage {
    for (size_t i = 0; i < sorted_index_set.size();) {
      bst_uint fid = sorted_index_set[i];
      if (disk_offset_[fid] != curr_offset) {
        utils::Assert(disk_offset_[fid] > curr_offset, "fset index was not sorted");
        CHECK_GT(disk_offset_[fid], curr_offset);
        fi->Seek(begin + disk_offset_[fid] * sizeof(SparseBatch::Entry));
        curr_offset = disk_offset_[fid];
      }
@ -68,10 +70,12 @@ class SparsePage {
          break;
        }
      }

      if (size_to_read != 0) {
        utils::Check(fi->Read(BeginPtr(data) + offset[i],
                              size_to_read * sizeof(SparseBatch::Entry)) != 0,
                     "Invalid SparsePage file");
        CHECK_EQ(fi->Read(dmlc::BeginPtr(data) + offset[i],
                          size_to_read * sizeof(SparseBatch::Entry)),
                 size_to_read * sizeof(SparseBatch::Entry))
            << "Invalid SparsePage file";
        curr_offset += size_to_read;
      }
      i = j;
@ -87,13 +91,14 @@ class SparsePage {
   * \param fi the input stream of the file
   * \return true if the loading was successful, false if end of file was reached
   */
  inline bool Load(utils::IStream *fi) {
  inline bool Load(dmlc::Stream *fi) {
    if (!fi->Read(&offset)) return false;
    utils::Check(offset.size() != 0, "Invalid SparsePage file");
    CHECK_NE(offset.size(), 0) << "Invalid SparsePage file";
    data.resize(offset.back());
    if (data.size() != 0) {
      utils::Check(fi->Read(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry)) != 0,
                   "Invalid SparsePage file");
      CHECK_EQ(fi->Read(dmlc::BeginPtr(data), data.size() * sizeof(SparseBatch::Entry)),
               data.size() * sizeof(SparseBatch::Entry))
          << "Invalid SparsePage file";
    }
    return true;
  }
@ -102,12 +107,12 @@ class SparsePage {
   *  to disk it must contain all the elements in the
   * \param fo output stream
   */
  inline void Save(utils::IStream *fo) const {
    utils::Assert(offset.size() != 0 && offset[0] == 0, "bad offset");
    utils::Assert(offset.back() == data.size(), "inconsistent SparsePage");
  inline void Save(dmlc::Stream *fo) const {
    CHECK(offset.size() != 0 && offset[0] == 0);
    CHECK_EQ(offset.back(), data.size());
    fo->Write(offset);
    if (data.size() != 0) {
      fo->Write(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry));
      fo->Write(dmlc::BeginPtr(data), data.size() * sizeof(SparseBatch::Entry));
    }
  }
  /*! \return estimation of memory cost of this page */
@ -125,13 +130,14 @@ class SparsePage {
   * \param fi the input stream of the file
   * \return true if the loading was successful, false if end of file was reached
   */
  inline bool PushLoad(utils::IStream *fi) {
  inline bool PushLoad(dmlc::Stream *fi) {
    if (!fi->Read(&disk_offset_)) return false;
    data.resize(offset.back() + disk_offset_.back());
    if (disk_offset_.back() != 0) {
      utils::Check(fi->Read(BeginPtr(data) + offset.back(),
                            disk_offset_.back() * sizeof(SparseBatch::Entry)) != 0,
                   "Invalid SparsePage file");
      CHECK_EQ(fi->Read(dmlc::BeginPtr(data) + offset.back(),
                        disk_offset_.back() * sizeof(SparseBatch::Entry)),
               disk_offset_.back() * sizeof(SparseBatch::Entry))
          << "Invalid SparsePage file";
    }
    size_t top = offset.back();
    size_t begin = offset.size();
@ -147,7 +153,7 @@ class SparsePage {
   */
  inline void Push(const RowBatch &batch) {
    data.resize(offset.back() + batch.ind_ptr[batch.size]);
    std::memcpy(BeginPtr(data) + offset.back(),
    std::memcpy(dmlc::BeginPtr(data) + offset.back(),
                batch.data_ptr + batch.ind_ptr[0],
                sizeof(SparseBatch::Entry) * batch.ind_ptr[batch.size]);
    size_t top = offset.back();
@ -164,8 +170,8 @@ class SparsePage {
  inline void Push(const SparsePage &batch) {
    size_t top = offset.back();
    data.resize(top + batch.data.size());
    std::memcpy(BeginPtr(data) + top,
                BeginPtr(batch.data),
    std::memcpy(dmlc::BeginPtr(data) + top,
                dmlc::BeginPtr(batch.data),
                sizeof(SparseBatch::Entry) * batch.data.size());
    size_t begin = offset.size();
    offset.resize(begin + batch.Size());
@ -182,7 +188,7 @@ class SparsePage {
    size_t begin = data.size();
    data.resize(begin + inst.length);
    if (inst.length != 0) {
      std::memcpy(BeginPtr(data) + begin, inst.data,
      std::memcpy(dmlc::BeginPtr(data) + begin, inst.data,
                  sizeof(SparseBatch::Entry) * inst.length);
    }
  }
@ -193,8 +199,8 @@ class SparsePage {
  inline RowBatch GetRowBatch(size_t base_rowid) const {
    RowBatch out;
    out.base_rowid = base_rowid;
    out.ind_ptr = BeginPtr(offset);
    out.data_ptr = BeginPtr(data);
    out.ind_ptr = dmlc::BeginPtr(offset);
    out.data_ptr = dmlc::BeginPtr(data);
    out.size = offset.size() - 1;
    return out;
  }
@ -203,70 +209,6 @@ class SparsePage {
  /*! \brief external memory column offset */
  std::vector<size_t> disk_offset_;
};
/*!
 * \brief factory class for SparsePage,
 *  used in threadbuffer template
 */
class SparsePageFactory {
 public:
  SparsePageFactory(void)
      : action_load_all_(true), set_load_all_(true) {}
  inline void SetFile(const utils::FileStream &fi,
                      size_t file_begin = 0) {
    fi_ = fi;
    file_begin_ = file_begin;
  }
  inline const std::vector<bst_uint> &index_set(void) const {
    return action_index_set_;
  }
  // set index set, will be used after next before first
  inline void SetIndexSet(const std::vector<bst_uint> &index_set,
                          bool load_all) {
    set_load_all_ = load_all;
    if (!set_load_all_) {
      set_index_set_ = index_set;
      std::sort(set_index_set_.begin(), set_index_set_.end());
    }
  }
  inline bool Init(void) {
    return true;
  }
  inline void SetParam(const char *name, const char *val) {}
  inline bool LoadNext(SparsePage *val) {
    if (!action_load_all_) {
      if (action_index_set_.size() == 0) {
        return false;
      } else {
        return val->Load(&fi_, action_index_set_);
      }
    } else {
      return val->Load(&fi_);
    }
  }
  inline SparsePage *Create(void) {
    return new SparsePage();
  }
  inline void FreeSpace(SparsePage *a) {
    delete a;
  }
  inline void Destroy(void) {
    fi_.Close();
  }
  inline void BeforeFirst(void) {
    fi_.Seek(file_begin_);
    action_load_all_ = set_load_all_;
    if (!set_load_all_) {
      action_index_set_ = set_index_set_;
    }
  }

 private:
  bool action_load_all_, set_load_all_;
  size_t file_begin_;
  utils::FileStream fi_;
  std::vector<bst_uint> action_index_set_;
  std::vector<bst_uint> set_index_set_;
};
} // namespace io
} // namespace data
} // namespace xgboost
#endif // XGBOOST_IO_SPARSE_BATCH_PAGE_H_
#endif // XGBOOST_DATA_SPARSE_BATCH_PAGE_H_
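
With SparsePage now speaking plain dmlc::Stream, persistence is a symmetric Save/Load pair on any stream implementation. A round-trip sketch (the file name is illustrative):

xgboost::data::SparsePage page;
// ... fill page with Push(...) calls ...
{
  std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create("col.page", "w"));
  page.Save(fo.get());  // writes offset vector, then the raw entries
}
xgboost::data::SparsePage loaded;
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create("col.page", "r"));
CHECK(loaded.Load(fi.get()));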

@ -5,10 +5,10 @@
 * the update rule is parallel coordinate descent (shotgun)
 * \author Tianqi Chen
 */
#include <dmlc/logging.h>
#include <dmlc/omp.h>
#include <dmlc/parameter.h>
#include <xgboost/gbm.h>
#include <xgboost/logging.h>
#include <vector>
#include <string>
#include <sstream>
@ -17,6 +17,9 @@

namespace xgboost {
namespace gbm {

DMLC_REGISTRY_FILE_TAG(gblinear);

// model parameter
struct GBLinearModelParam : public dmlc::Parameter<GBLinearModelParam> {
  // number of feature dimensions
@ -168,6 +171,9 @@ class GBLinear : public GradientBooster {
                int64_t buffer_offset,
                std::vector<float> *out_preds,
                unsigned ntree_limit) override {
    if (model.weight.size() == 0) {
      model.InitModel();
    }
    CHECK_EQ(ntree_limit, 0)
        << "GBLinear::Predict ntrees is only valid for gbtree predictor";
    std::vector<float> &preds = *out_preds;
@ -293,4 +299,3 @@ XGBOOST_REGISTER_GBM(GBLinear, "gblinear")
    });
}  // namespace gbm
}  // namespace xgboost

29
src/gbm/gbm.cc
Normal file
@ -0,0 +1,29 @@
/*!
 * Copyright 2015 by Contributors
 * \file gbm.cc
 * \brief Registry of gradient boosters.
 */
#include <xgboost/gbm.h>
#include <dmlc/registry.h>

namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::GradientBoosterReg);
}  // namespace dmlc

namespace xgboost {
GradientBooster* GradientBooster::Create(const std::string& name) {
  auto *e = ::dmlc::Registry< ::xgboost::GradientBoosterReg>::Get()->Find(name);
  if (e == nullptr) {
    LOG(FATAL) << "Unknown gbm type " << name;
  }
  return (e->body)();
}
}  // namespace xgboost

namespace xgboost {
namespace gbm {
// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(gblinear);
DMLC_REGISTRY_LINK_TAG(gbtree);
}  // namespace gbm
}  // namespace xgboost
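
The registry plus the XGBOOST_REGISTER_GBM blocks in gblinear.cc and gbtree.cc give name-based construction. A sketch of resolving a booster by its registered name:

// "gbtree" and "gblinear" are the names registered in this commit
std::unique_ptr<xgboost::GradientBooster> gbm(
    xgboost::GradientBooster::Create("gbtree"));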

@ -4,9 +4,9 @@
 * \brief gradient boosted tree implementation.
 * \author Tianqi Chen
 */
#include <dmlc/logging.h>
#include <dmlc/omp.h>
#include <dmlc/parameter.h>
#include <xgboost/logging.h>
#include <xgboost/gbm.h>
#include <xgboost/tree_updater.h>

@ -19,6 +19,8 @@
namespace xgboost {
namespace gbm {

DMLC_REGISTRY_FILE_TAG(gbtree);

/*! \brief training parameters */
struct GBTreeTrainParam : public dmlc::Parameter<GBTreeTrainParam> {
  /*! \brief number of threads */
@ -482,4 +484,3 @@ XGBOOST_REGISTER_GBM(GBTree, "gbtree")
    });
}  // namespace gbm
}  // namespace xgboost

@ -1,72 +0,0 @@
/*!
 * Copyright 2015 by Contributors
 * \file global.cc
 * \brief Enable all kinds of global static registry and variables.
 */
#include <xgboost/objective.h>
#include <xgboost/metric.h>
#include <xgboost/tree_updater.h>
#include <xgboost/gbm.h>
#include "./common/random.h"
#include "./common/base64.h"

namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::ObjFunctionReg);
DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
DMLC_REGISTRY_ENABLE(::xgboost::TreeUpdaterReg);
DMLC_REGISTRY_ENABLE(::xgboost::GradientBoosterReg);
}  // namespace dmlc

namespace xgboost {
// implement factory functions
ObjFunction* ObjFunction::Create(const std::string& name) {
  auto *e = ::dmlc::Registry< ::xgboost::ObjFunctionReg>::Get()->Find(name);
  if (e == nullptr) {
    LOG(FATAL) << "Unknown objective function " << name;
  }
  return (e->body)();
}

Metric* Metric::Create(const std::string& name) {
  std::string buf = name;
  std::string prefix = name;
  auto pos = buf.find('@');
  if (pos == std::string::npos) {
    auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(name);
    if (e == nullptr) {
      LOG(FATAL) << "Unknown objective function " << name;
    }
    return (e->body)(nullptr);
  } else {
    std::string prefix = buf.substr(0, pos);
    auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(prefix.c_str());
    if (e == nullptr) {
      LOG(FATAL) << "Unknown objective function " << name;
    }
    return (e->body)(buf.substr(pos + 1, buf.length()).c_str());
  }
}

TreeUpdater* TreeUpdater::Create(const std::string& name) {
  auto *e = ::dmlc::Registry< ::xgboost::TreeUpdaterReg>::Get()->Find(name);
  if (e == nullptr) {
    LOG(FATAL) << "Unknown tree updater " << name;
  }
  return (e->body)();
}

GradientBooster* GradientBooster::Create(const std::string& name) {
  auto *e = ::dmlc::Registry< ::xgboost::GradientBoosterReg>::Get()->Find(name);
  if (e == nullptr) {
    LOG(FATAL) << "Unknown gbm type " << name;
  }
  return (e->body)();
}

namespace common {
RandomEngine& GlobalRandom() {
  static RandomEngine inst;
  return inst;
}
}
}  // namespace xgboost
@ -11,6 +11,7 @@
#include <string>
#include <sstream>
#include <limits>
#include <iomanip>
#include "./common/io.h"
#include "./common/random.h"

@ -94,6 +95,9 @@ struct LearnerTrainParam
  }
};

DMLC_REGISTER_PARAMETER(LearnerModelParam);
DMLC_REGISTER_PARAMETER(LearnerTrainParam);

/*!
 * \brief learner that performs gradient boosting for a specific objective function.
 *  It does training and prediction.
@ -144,6 +148,9 @@ class LearnerImpl : public Learner {

    if (cfg_.count("num_class") != 0) {
      cfg_["num_output_group"] = cfg_["num_class"];
      if (atoi(cfg_["num_class"].c_str()) > 1 && cfg_.count("objective") == 0) {
        cfg_["objective"] = "multi:softmax";
      }
    }

    if (cfg_.count("max_delta_step") == 0 &&
@ -187,6 +194,10 @@ class LearnerImpl : public Learner {
    }
  }

  void InitModel() override {
    this->LazyInitModel();
  }

  void Load(dmlc::Stream* fi) override {
    // TODO(tqchen) mark deprecation of old format.
    common::PeekableInStream fp(fi);
@ -202,7 +213,6 @@ class LearnerImpl : public Learner {
    }
    // use the peekable reader.
    fi = &fp;
    std::string name_gbm, name_obj;
    // read parameter
    CHECK_EQ(fi->Read(&mparam, sizeof(mparam)), sizeof(mparam))
        << "BoostLearner: wrong model format";
@ -218,7 +228,7 @@ class LearnerImpl : public Learner {
      len = len >> static_cast<uint64_t>(32UL);
    }
    if (len != 0) {
      name_obj.resize(len);
      name_obj_.resize(len);
      CHECK_EQ(fi->Read(&name_obj_[0], len), len)
          << "BoostLearner: wrong model format";
    }
@ -226,8 +236,10 @@ class LearnerImpl : public Learner {
    CHECK(fi->Read(&name_gbm_))
        << "BoostLearner: wrong model format";
    // duplicated code with LazyInitModel
    obj_.reset(ObjFunction::Create(cfg_.at(name_obj_)));
    gbm_.reset(GradientBooster::Create(cfg_.at(name_gbm_)));
    obj_.reset(ObjFunction::Create(name_obj_));
    gbm_.reset(GradientBooster::Create(name_gbm_));
    gbm_->Load(fi);

    if (metrics_.size() == 0) {
      metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric()));
    }
@ -246,11 +258,12 @@ class LearnerImpl : public Learner {
  }

  void UpdateOneIter(int iter, DMatrix* train) override {
    CHECK(ModelInitialized())
        << "Always call InitModel or LoadModel before update";
    if (tparam.seed_per_iteration || rabit::IsDistributed()) {
      common::GlobalRandom().seed(tparam.seed * kRandSeedMagic + iter);
    }
    this->LazyInitDMatrix(train);
    this->LazyInitModel();
    this->PredictRaw(train, &preds_);
    obj_->GetGradient(preds_, train->info(), iter, &gpair_);
    gbm_->DoBoost(train, this->FindBufferOffset(train), &gpair_);
@ -262,6 +275,7 @@ class LearnerImpl : public Learner {
    if (tparam.seed_per_iteration || rabit::IsDistributed()) {
      common::GlobalRandom().seed(tparam.seed * kRandSeedMagic + iter);
    }
    this->LazyInitDMatrix(train);
    gbm_->DoBoost(train, this->FindBufferOffset(train), in_gpair);
  }

@ -269,7 +283,8 @@ class LearnerImpl : public Learner {
                  const std::vector<DMatrix*>& data_sets,
                  const std::vector<std::string>& data_names) override {
    std::ostringstream os;
    os << '[' << iter << ']';
    os << '[' << iter << ']'
       << std::setiosflags(std::ios::fixed);
    for (size_t i = 0; i < data_sets.size(); ++i) {
      this->PredictRaw(data_sets[i], &preds_);
      obj_->EvalTransform(&preds_);
@ -347,8 +362,6 @@ class LearnerImpl : public Learner {
    if (num_feature > mparam.num_feature) {
      mparam.num_feature = num_feature;
    }
    // reset the base score
    mparam.base_score = obj_->ProbToMargin(mparam.base_score);

    // setup
    cfg_["num_feature"] = ToString(mparam.num_feature);
@ -357,9 +370,13 @@ class LearnerImpl : public Learner {
    gbm_.reset(GradientBooster::Create(name_gbm_));
    gbm_->Configure(cfg_.begin(), cfg_.end());
    obj_->Configure(cfg_.begin(), cfg_.end());

    // reset the base score
    mparam.base_score = obj_->ProbToMargin(mparam.base_score);
    if (metrics_.size() == 0) {
      metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric()));
    }

    this->base_score_ = mparam.base_score;
    gbm_->ResetPredBuffer(pred_buffer_size_);
  }
@ -373,6 +390,8 @@ class LearnerImpl : public Learner {
  inline void PredictRaw(DMatrix* data,
                         std::vector<float>* out_preds,
                         unsigned ntree_limit = 0) const {
    CHECK(gbm_.get() != nullptr)
        << "Predict must happen after Load or InitModel";
    gbm_->Predict(data,
                  this->FindBufferOffset(data),
                  out_preds,
20
src/logging.cc
Normal file
20
src/logging.cc
Normal file
@ -0,0 +1,20 @@
|
||||
/*!
|
||||
* Copyright 2015 by Contributors
|
||||
* \file logging.cc
|
||||
* \brief Implementation of loggers.
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <xgboost/logging.h>
|
||||
#include <iostream>
|
||||
#include "./common/sync.h"
|
||||
|
||||
namespace xgboost {
|
||||
ConsoleLogger::~ConsoleLogger() {
|
||||
std::cout << log_stream_.str() << std::endl;
|
||||
}
|
||||
|
||||
TrackerLogger::~TrackerLogger() {
|
||||
log_stream_ << '\n';
|
||||
rabit::TrackerPrint(log_stream_.str());
|
||||
}
|
||||
} // namespace xgboost
|
||||
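Both destructors implement the same pattern: a logging statement builds a temporary logger, streams into it, and the accumulated message is emitted exactly once when the temporary dies at the end of the statement. A minimal usage sketch, assuming the LOG macros from xgboost/logging.h dispatch to these classes (num_row is a stand-in variable, not from this commit):

LOG(INFO) << "loaded " << num_row << " rows";  // printed by ~ConsoleLogger()
// under rabit, TrackerLogger forwards the finished line to the tracker instead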
@ -5,12 +5,16 @@
* \author Kailong Chen, Tianqi Chen
*/
#include <xgboost/metric.h>
#include <dmlc/registry.h>
#include <cmath>
#include "../common/math.h"
#include "../common/sync.h"

namespace xgboost {
namespace metric {
// tag this file, used by force static link later.
DMLC_REGISTRY_FILE_TAG(elementwise_metric);

/*!
* \brief base class of element-wise evaluation
* \tparam Derived the name of subclass
@ -124,4 +128,3 @@ XGBOOST_REGISTER_METRIC(PossionNegLoglik, "poisson-nloglik")

} // namespace metric
} // namespace xgboost


42
src/metric/metric.cc
Normal file
42
src/metric/metric.cc
Normal file
@ -0,0 +1,42 @@
/*!
* Copyright 2015 by Contributors
* \file metric.cc
* \brief Registry of evaluation metrics.
*/
#include <xgboost/metric.h>
#include <dmlc/registry.h>

namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
}

namespace xgboost {
Metric* Metric::Create(const std::string& name) {
std::string buf = name;
auto pos = buf.find('@');
if (pos == std::string::npos) {
auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(name);
if (e == nullptr) {
LOG(FATAL) << "Unknown metric function " << name;
}
return (e->body)(nullptr);
} else {
std::string prefix = buf.substr(0, pos);
auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(prefix.c_str());
if (e == nullptr) {
LOG(FATAL) << "Unknown metric function " << name;
}
return (e->body)(buf.substr(pos + 1, buf.length()).c_str());
}
}
} // namespace xgboost

namespace xgboost {
namespace metric {
// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(elementwise_metric);
DMLC_REGISTRY_LINK_TAG(multiclass_metric);
DMLC_REGISTRY_LINK_TAG(rank_metric);
} // namespace metric
} // namespace xgboost
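The '@' split above is what lets one registered metric accept an inline argument. A sketch of how two lookups resolve, assuming a metric registered under the name "ndcg" elsewhere in the tree:

Metric* plain = Metric::Create("ndcg");   // registry hit on "ndcg", body receives nullptr
Metric* at_k  = Metric::Create("ndcg@5"); // registry hit on "ndcg", body receives "5"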
@ -11,6 +11,9 @@

namespace xgboost {
namespace metric {
// tag this file, used by force static link later.
DMLC_REGISTRY_FILE_TAG(multiclass_metric);

/*!
* \brief base class of multi-class evaluation
* \tparam Derived the name of subclass
@ -114,4 +117,3 @@ XGBOOST_REGISTER_METRIC(MultiLogLoss, "mlogloss")
.set_body([](const char* param) { return new EvalMultiLogLoss(); });
} // namespace metric
} // namespace xgboost


@ -5,12 +5,16 @@
* \author Kailong Chen, Tianqi Chen
*/
#include <xgboost/metric.h>
#include <dmlc/registry.h>
#include <cmath>
#include "../common/sync.h"
#include "../common/math.h"

namespace xgboost {
namespace metric {
// tag this file, used by force static link later.
DMLC_REGISTRY_FILE_TAG(rank_metric);

/*! \brief AMS: also records best threshold */
struct EvalAMS : public Metric {
public:

@ -4,9 +4,9 @@
* \brief Definition of multi-class classification objectives.
* \author Tianqi Chen
*/
#include <dmlc/logging.h>
#include <dmlc/omp.h>
#include <dmlc/parameter.h>
#include <xgboost/logging.h>
#include <xgboost/objective.h>
#include <vector>
#include <algorithm>
@ -16,6 +16,8 @@
namespace xgboost {
namespace obj {

DMLC_REGISTRY_FILE_TAG(multiclass_obj);

struct SoftmaxMultiClassParam : public dmlc::Parameter<SoftmaxMultiClassParam> {
int num_class;
// declare parameters

34
src/objective/objective.cc
Normal file
34
src/objective/objective.cc
Normal file
@ -0,0 +1,34 @@
/*!
* Copyright 2015 by Contributors
* \file objective.cc
* \brief Registry of all objective functions.
*/
#include <xgboost/objective.h>
#include <dmlc/registry.h>

namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::ObjFunctionReg);
} // namespace dmlc

namespace xgboost {
// implement factory functions
ObjFunction* ObjFunction::Create(const std::string& name) {
auto *e = ::dmlc::Registry< ::xgboost::ObjFunctionReg>::Get()->Find(name);
if (e == nullptr) {
for (const auto& entry : ::dmlc::Registry< ::xgboost::ObjFunctionReg>::List()) {
LOG(INFO) << "Objective candidate: " << entry->name;
}
LOG(FATAL) << "Unknown objective function " << name;
}
return (e->body)();
}
} // namespace xgboost

namespace xgboost {
namespace obj {
// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(regression_obj);
DMLC_REGISTRY_LINK_TAG(multiclass_obj);
DMLC_REGISTRY_LINK_TAG(rank_obj);
} // namespace obj
} // namespace xgboost
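The lookup side above pairs with XGBOOST_REGISTER_OBJECTIVE in the files that the link tags pull in. A minimal registration sketch; the objective name and classes here are purely illustrative, not part of this commit:

XGBOOST_REGISTER_OBJECTIVE(DemoObj, "demo:obj")
.describe("Illustrative objective showing the registry flow.")
.set_body([]() { return new DemoObjImpl(); });
// afterwards ObjFunction::Create("demo:obj") returns a fresh DemoObjImpl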
@ -4,8 +4,8 @@
* \brief Definition of rank loss.
* \author Tianqi Chen, Kailong Chen
*/
#include <dmlc/logging.h>
#include <dmlc/omp.h>
#include <xgboost/logging.h>
#include <xgboost/objective.h>
#include <vector>
#include <algorithm>
@ -16,6 +16,8 @@
namespace xgboost {
namespace obj {

DMLC_REGISTRY_FILE_TAG(rank_obj);

struct LambdaRankParam : public dmlc::Parameter<LambdaRankParam> {
int num_pairsample;
float fix_list_weight;
@ -324,4 +326,3 @@ XGBOOST_REGISTER_OBJECTIVE(LambdaRankObjMAP, "rank:map")

} // namespace obj
} // namespace xgboost


@ -4,8 +4,8 @@
* \brief Definition of single-value regression and classification objectives.
* \author Tianqi Chen, Kailong Chen
*/
#include <dmlc/logging.h>
#include <dmlc/omp.h>
#include <xgboost/logging.h>
#include <xgboost/objective.h>
#include <vector>
#include <algorithm>
@ -14,6 +14,9 @@

namespace xgboost {
namespace obj {

DMLC_REGISTRY_FILE_TAG(regression_obj);

// common regressions
// linear regression
struct LinearSquareLoss {
@ -84,7 +87,9 @@ class RegLossObj : public ObjFunction {
int iter,
std::vector<bst_gpair> *out_gpair) override {
CHECK_NE(info.labels.size(), 0) << "label set cannot be empty";
CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided";
CHECK_EQ(preds.size(), info.labels.size())
<< "labels are not correctly provided, "
<< "preds.size=" << preds.size() << ", label.size=" << info.labels.size();
out_gpair->resize(preds.size());
// check if label is in range
bool label_correct = true;
@ -95,7 +100,7 @@ class RegLossObj : public ObjFunction {
float p = Loss::PredTransform(preds[i]);
float w = info.GetWeight(i);
if (info.labels[i] == 1.0f) w *= param_.scale_pos_weight;
if (Loss::CheckLabel(info.labels[i])) label_correct = false;
if (!Loss::CheckLabel(info.labels[i])) label_correct = false;
out_gpair->at(i) = bst_gpair(Loss::FirstOrderGradient(p, info.labels[i]) * w,
Loss::SecondOrderGradient(p, info.labels[i]) * w);
}
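The negation added above is a genuine bug fix: Loss::CheckLabel returns true for a valid label, so the old line cleared label_correct on every good label and never on a bad one. For the logistic losses the intended predicate is, in sketch form (the exact bounds live in the policy class):

// static bool CheckLabel(float y) { return y >= 0.0f && y <= 1.0f; }
// label_correct is cleared only when this returns false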
@ -71,7 +71,7 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
.describe("L2 regularization on leaf weight");
DMLC_DECLARE_FIELD(reg_alpha).set_lower_bound(0.0f).set_default(0.0f)
.describe("L1 regularization on leaf weight");
DMLC_DECLARE_FIELD(default_direction)
DMLC_DECLARE_FIELD(default_direction).set_default(0)
.add_enum("learn", 0)
.add_enum("left", 1)
.add_enum("right", 2)

35
src/tree/tree_updater.cc
Normal file
35
src/tree/tree_updater.cc
Normal file
@ -0,0 +1,35 @@
/*!
* Copyright 2015 by Contributors
* \file tree_updater.cc
* \brief Registry of tree updaters.
*/
#include <xgboost/tree_updater.h>
#include <dmlc/registry.h>

namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::TreeUpdaterReg);
} // namespace dmlc

namespace xgboost {

TreeUpdater* TreeUpdater::Create(const std::string& name) {
auto *e = ::dmlc::Registry< ::xgboost::TreeUpdaterReg>::Get()->Find(name);
if (e == nullptr) {
LOG(FATAL) << "Unknown tree updater " << name;
}
return (e->body)();
}

} // namespace xgboost

namespace xgboost {
namespace tree {
// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(updater_colmaker);
DMLC_REGISTRY_LINK_TAG(updater_skmaker);
DMLC_REGISTRY_LINK_TAG(updater_refresh);
DMLC_REGISTRY_LINK_TAG(updater_prune);
DMLC_REGISTRY_LINK_TAG(updater_histmaker);
DMLC_REGISTRY_LINK_TAG(updater_sync);
} // namespace tree
} // namespace xgboost
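This registry mirrors the objective and metric ones: every tree algorithm becomes reachable through a single string key, which is what the "updater" training parameter resolves against. A minimal sketch of both sides; the updater name here is illustrative, while the real keys ("grow_skmaker", "distcol", ...) are registered in the files tagged above:

XGBOOST_REGISTER_TREE_UPDATER(DemoUpdater, "demo_updater")
.set_body([]() { return new DemoUpdaterImpl(); });
std::unique_ptr<TreeUpdater> up(TreeUpdater::Create("demo_updater"));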
@ -15,6 +15,9 @@

namespace xgboost {
namespace tree {

DMLC_REGISTRY_FILE_TAG(updater_colmaker);

/*! \brief column-wise update to construct a tree */
template<typename TStats>
class ColMaker: public TreeUpdater {
@ -891,4 +894,3 @@ XGBOOST_REGISTER_TREE_UPDATER(DistColMaker, "distcol")
});
} // namespace tree
} // namespace xgboost


@ -15,6 +15,9 @@

namespace xgboost {
namespace tree {

DMLC_REGISTRY_FILE_TAG(updater_histmaker);

template<typename TStats>
class HistMaker: public BaseMaker {
public:

@ -14,6 +14,9 @@

namespace xgboost {
namespace tree {

DMLC_REGISTRY_FILE_TAG(updater_prune);

/*! \brief pruner that prunes a tree after growing finishes */
class TreePruner: public TreeUpdater {
public:

@ -14,6 +14,9 @@

namespace xgboost {
namespace tree {

DMLC_REGISTRY_FILE_TAG(updater_refresh);

/*! \brief refresher that recomputes a tree's statistics after growing finishes */
template<typename TStats>
class TreeRefresher: public TreeUpdater {

@ -18,6 +18,8 @@
namespace xgboost {
namespace tree {

DMLC_REGISTRY_FILE_TAG(updater_skmaker);

class SketchMaker: public BaseMaker {
public:
void Update(const std::vector<bst_gpair> &gpair,
@ -399,4 +401,3 @@ XGBOOST_REGISTER_TREE_UPDATER(SketchMaker, "grow_skmaker")
});
} // namespace tree
} // namespace xgboost


@ -12,6 +12,9 @@

namespace xgboost {
namespace tree {

DMLC_REGISTRY_FILE_TAG(updater_sync);

/*!
* \brief synchronizer that synchronizes the tree across all distributed nodes;
* can implement various strategies, so far it always uses node 0's tree

3
wrapper/.gitignore
vendored
3
wrapper/.gitignore
vendored
@ -1,3 +0,0 @@
build
dist
*.egg*
@ -1,9 +0,0 @@
XGBoost Wrappers
================
This folder provides wrappers to create xgboost packages for other languages.

***Supported Language Packages***
* [Python package](../python-package)
* [R-package](../R-package)
* [Java Package](../java)
* [Julia Package](https://github.com/antinucleon/XGBoost.jl)
@ -1,599 +0,0 @@
// Copyright (c) 2014 by Contributors
// implementations in ctypes
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <cstdio>
#include <vector>
#include <string>
#include <cstring>
#include <cmath>
#include <algorithm>
#include <exception>
// include all std functions
using namespace std;
#include "./xgboost_wrapper.h"
#include "../src/data.h"
#include "../src/learner/learner-inl.hpp"
#include "../src/io/io.h"
#include "../src/utils/utils.h"
#include "../src/utils/math.h"
#include "../src/utils/group_data.h"
#include "../src/io/simple_dmatrix-inl.hpp"

using namespace xgboost;
using namespace xgboost::io;

namespace xgboost {
namespace wrapper {
// booster wrapper class
class Booster: public learner::BoostLearner {
public:
explicit Booster(const std::vector<DataMatrix*>& mats) {
this->silent = 1;
this->init_model = false;
this->SetCacheData(mats);
}
inline const float *Pred(const DataMatrix &dmat, int option_mask,
unsigned ntree_limit, bst_ulong *len) {
this->CheckInitModel();
this->Predict(dmat, (option_mask&1) != 0, &this->preds_,
ntree_limit, (option_mask&2) != 0);
*len = static_cast<bst_ulong>(this->preds_.size());
return BeginPtr(this->preds_);
}
inline void BoostOneIter(const DataMatrix &train,
float *grad, float *hess, bst_ulong len) {
this->gpair_.resize(len);
const bst_omp_uint ndata = static_cast<bst_omp_uint>(len);
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
gpair_[j] = bst_gpair(grad[j], hess[j]);
}
gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
}
inline void CheckInitModel(void) {
if (!init_model) {
this->InitModel(); init_model = true;
}
}
inline void LoadModel(const char *fname) {
learner::BoostLearner::LoadModel(fname);
this->init_model = true;
}
inline void LoadModelFromBuffer(const void *buf, size_t size) {
utils::MemoryFixSizeBuffer fs((void*)buf, size); // NOLINT(*)
learner::BoostLearner::LoadModel(fs, true);
this->init_model = true;
}
inline const char *GetModelRaw(bst_ulong *out_len) {
this->CheckInitModel();
model_str.resize(0);
utils::MemoryBufferStream fs(&model_str);
learner::BoostLearner::SaveModel(fs, false);
*out_len = static_cast<bst_ulong>(model_str.length());
if (*out_len == 0) {
return NULL;
} else {
return &model_str[0];
}
}
inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, bst_ulong *len) {
model_dump = this->DumpModel(fmap, with_stats);
model_dump_cptr.resize(model_dump.size());
for (size_t i = 0; i < model_dump.size(); ++i) {
model_dump_cptr[i] = model_dump[i].c_str();
}
*len = static_cast<bst_ulong>(model_dump.size());
return BeginPtr(model_dump_cptr);
}
// temporary fields
// temporary data holding the evaluation string
std::string eval_str;
// temporary data to save model dump
std::string model_str;
// temporary space to save model dump
std::vector<std::string> model_dump;
std::vector<const char*> model_dump_cptr;

private:
bool init_model;
};
} // namespace wrapper
} // namespace xgboost

using namespace xgboost::wrapper;

#ifndef XGBOOST_STRICT_CXX98_
namespace xgboost {
namespace wrapper {
// helper to support threadlocal
struct ThreadLocalStore {
std::vector<std::string*> data;
// allocate a string
inline std::string *Alloc() {
mutex.Lock();
data.push_back(new std::string());
std::string *ret = data.back();
mutex.Unlock();
return ret;
}
ThreadLocalStore() {
mutex.Init();
}
~ThreadLocalStore() {
for (size_t i = 0; i < data.size(); ++i) {
delete data[i];
}
mutex.Destroy();
}
utils::Mutex mutex;
};

static ThreadLocalStore thread_local_store;
} // namespace wrapper
} // namespace xgboost

/*! \brief macro to guard beginning and end section of all functions */
#define API_BEGIN() try {
/*!
* \brief every function starts with API_BEGIN(); and finishes with API_END();
* \param Finalize optional finalizer to run before returning
*/
#define API_END_FINALIZE(Finalize) } catch(std::exception &e) { \
Finalize; return XGBHandleException(e); \
} return 0;
/*! \brief API End with no finalization */
#define API_END() API_END_FINALIZE(;)

// do not use threadlocal on OSX since it is not always available
#ifndef DISABLE_THREAD_LOCAL
#ifdef __GNUC__
#define XGB_TREAD_LOCAL __thread
#elif __STDC_VERSION__ >= 201112L
#define XGB_TREAD_LOCAL _Thread_local
#elif defined(_MSC_VER)
#define XGB_TREAD_LOCAL __declspec(thread)
#endif
#endif

#ifndef XGB_TREAD_LOCAL
#pragma message("Warning: thread-local storage not enabled, using single-thread error handling")
#define XGB_TREAD_LOCAL
#endif

/*!
* \brief a helper function for error handling
* will set the last error to be str_set when it is not NULL
* \param str_set the error to set
* \return a pointer to the last error message
*/
const char *XGBSetGetLastError_(const char *str_set) {
// use last_error to record last error
static XGB_TREAD_LOCAL std::string *last_error = NULL;
if (last_error == NULL) {
last_error = thread_local_store.Alloc();
}
if (str_set != NULL) {
*last_error = str_set;
}
return last_error->c_str();
}
#else
// crippled implementation for the Solaris case;
// exception handling is not needed for R, so it is OK.
#define API_BEGIN()
#define API_END_FINALIZE(Finalize) return 0
#define API_END() return 0

const char *XGBSetGetLastError_(const char *str_set) {
return NULL;
}
#endif // XGBOOST_STRICT_CXX98_

/*! \brief return str message of the last error */
const char *XGBGetLastError() {
return XGBSetGetLastError_(NULL);
}

/*!
* \brief handle an exception thrown out of an API call
* \param e the exception
* \return the return value of the API after the exception is handled
*/
int XGBHandleException(const std::exception &e) {
XGBSetGetLastError_(e.what());
return -1;
}
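Everything below follows one contract: each C entry point returns 0 on success and -1 after a caught exception, with the message stored per thread. A minimal caller sketch under that contract (handles are stand-ins):

if (XGBoosterUpdateOneIter(booster, iter, dtrain) != 0) {
fprintf(stderr, "xgboost error: %s\n", XGBGetLastError());
}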
int XGDMatrixCreateFromFile(const char *fname,
int silent,
DMatrixHandle *out) {
API_BEGIN();
*out = LoadDataMatrix(fname, silent != 0, false, false);
API_END();
}

int XGDMatrixCreateFromCSR(const bst_ulong *indptr,
const unsigned *indices,
const float *data,
bst_ulong nindptr,
bst_ulong nelem,
DMatrixHandle *out) {
DMatrixSimple *p_mat = NULL;
API_BEGIN();
p_mat = new DMatrixSimple();
DMatrixSimple &mat = *p_mat;
mat.row_ptr_.resize(nindptr);
for (bst_ulong i = 0; i < nindptr; ++i) {
mat.row_ptr_[i] = static_cast<size_t>(indptr[i]);
}
mat.row_data_.resize(nelem);
for (bst_ulong i = 0; i < nelem; ++i) {
mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
mat.info.info.num_col = std::max(mat.info.info.num_col,
static_cast<size_t>(indices[i]+1));
}
mat.info.info.num_row = nindptr - 1;
*out = p_mat;
API_END_FINALIZE(delete p_mat);
}
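A minimal caller sketch for the CSR entry point above, building a 2x3 matrix with three non-zeros; nindptr counts offsets (rows + 1) and num_col is inferred from the largest column index:

bst_ulong indptr[]  = {0, 2, 3};           // 2 rows
unsigned  indices[] = {0, 2, 1};           // column of each value
float     data[]    = {1.0f, 2.0f, 3.0f};
DMatrixHandle dmat;
XGDMatrixCreateFromCSR(indptr, indices, data, 3, 3, &dmat);
// ... use dmat ...
XGDMatrixFree(dmat);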
int XGDMatrixCreateFromCSC(const bst_ulong *col_ptr,
const unsigned *indices,
const float *data,
bst_ulong nindptr,
bst_ulong nelem,
DMatrixHandle *out) {
DMatrixSimple *p_mat = NULL;
API_BEGIN();
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
}
p_mat = new DMatrixSimple();
DMatrixSimple &mat = *p_mat;
utils::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_);
builder.InitBudget(0, nthread);
long ncol = static_cast<long>(nindptr - 1); // NOLINT(*)
#pragma omp parallel for schedule(static)
for (long i = 0; i < ncol; ++i) { // NOLINT(*)
int tid = omp_get_thread_num();
for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.AddBudget(indices[j], tid);
}
}
builder.InitStorage();
#pragma omp parallel for schedule(static)
for (long i = 0; i < ncol; ++i) { // NOLINT(*)
int tid = omp_get_thread_num();
for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.Push(indices[j],
RowBatch::Entry(static_cast<bst_uint>(i), data[j]),
tid);
}
}
mat.info.info.num_row = mat.row_ptr_.size() - 1;
mat.info.info.num_col = static_cast<size_t>(ncol);
*out = p_mat;
API_END_FINALIZE(delete p_mat);
}

int XGDMatrixCreateFromMat(const float *data,
bst_ulong nrow,
bst_ulong ncol,
float missing,
DMatrixHandle *out) {
DMatrixSimple *p_mat = NULL;
API_BEGIN();
p_mat = new DMatrixSimple();
bool nan_missing = utils::CheckNAN(missing);
DMatrixSimple &mat = *p_mat;
mat.info.info.num_row = nrow;
mat.info.info.num_col = ncol;
for (bst_ulong i = 0; i < nrow; ++i, data += ncol) {
bst_ulong nelem = 0;
for (bst_ulong j = 0; j < ncol; ++j) {
if (utils::CheckNAN(data[j])) {
utils::Check(nan_missing,
"There are NAN in the matrix, however, you did not set missing=NAN");
} else {
if (nan_missing || data[j] != missing) {
mat.row_data_.push_back(RowBatch::Entry(j, data[j]));
++nelem;
}
}
}
mat.row_ptr_.push_back(mat.row_ptr_.back() + nelem);
}
*out = p_mat;
API_END_FINALIZE(delete p_mat);
}

int XGDMatrixSliceDMatrix(DMatrixHandle handle,
const int *idxset,
bst_ulong len,
DMatrixHandle *out) {
DMatrixSimple *p_ret = NULL;
API_BEGIN();
DMatrixSimple tmp;
DataMatrix &dsrc = *static_cast<DataMatrix*>(handle);
if (dsrc.magic != DMatrixSimple::kMagic) {
tmp.CopyFrom(dsrc);
}
DataMatrix &src = (dsrc.magic == DMatrixSimple::kMagic ?
*static_cast<DMatrixSimple*>(handle) : tmp);
p_ret = new DMatrixSimple();
DMatrixSimple &ret = *p_ret;

utils::Check(src.info.group_ptr.size() == 0,
"slice does not support group structure");
ret.Clear();
ret.info.info.num_row = len;
ret.info.info.num_col = src.info.num_col();

utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
iter->BeforeFirst();
utils::Assert(iter->Next(), "slice");
const RowBatch &batch = iter->Value();
for (bst_ulong i = 0; i < len; ++i) {
const int ridx = idxset[i];
RowBatch::Inst inst = batch[ridx];
utils::Check(static_cast<bst_ulong>(ridx) < batch.size, "slice index exceeds number of rows");
ret.row_data_.resize(ret.row_data_.size() + inst.length);
memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data,
sizeof(RowBatch::Entry) * inst.length);
ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length);
if (src.info.labels.size() != 0) {
ret.info.labels.push_back(src.info.labels[ridx]);
}
if (src.info.weights.size() != 0) {
ret.info.weights.push_back(src.info.weights[ridx]);
}
if (src.info.info.root_index.size() != 0) {
ret.info.info.root_index.push_back(src.info.info.root_index[ridx]);
}
if (src.info.info.fold_index.size() != 0) {
ret.info.info.fold_index.push_back(src.info.info.fold_index[ridx]);
}
}
*out = p_ret;
API_END_FINALIZE(delete p_ret);
}

int XGDMatrixFree(DMatrixHandle handle) {
API_BEGIN();
delete static_cast<DataMatrix*>(handle);
API_END();
}

int XGDMatrixSaveBinary(DMatrixHandle handle,
const char *fname,
int silent) {
API_BEGIN();
SaveDataMatrix(*static_cast<DataMatrix*>(handle), fname, silent != 0);
API_END();
}

int XGDMatrixSetFloatInfo(DMatrixHandle handle,
const char *field,
const float *info,
bst_ulong len) {
API_BEGIN();
std::vector<float> &vec =
static_cast<DataMatrix*>(handle)->info.GetFloatInfo(field);
vec.resize(len);
memcpy(BeginPtr(vec), info, sizeof(float) * len);
API_END();
}

int XGDMatrixSetUIntInfo(DMatrixHandle handle,
const char *field,
const unsigned *info,
bst_ulong len) {
API_BEGIN();
std::vector<unsigned> &vec =
static_cast<DataMatrix*>(handle)->info.GetUIntInfo(field);
vec.resize(len);
memcpy(BeginPtr(vec), info, sizeof(unsigned) * len);
API_END();
}

int XGDMatrixSetGroup(DMatrixHandle handle,
const unsigned *group,
bst_ulong len) {
API_BEGIN();
DataMatrix *pmat = static_cast<DataMatrix*>(handle);
pmat->info.group_ptr.resize(len + 1);
pmat->info.group_ptr[0] = 0;
for (uint64_t i = 0; i < len; ++i) {
pmat->info.group_ptr[i+1] = pmat->info.group_ptr[i] + group[i];
}
API_END();
}
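SetGroup turns per-group row counts into the cumulative offset array that the ranking objectives consume. The arithmetic above, worked on illustrative numbers:

// group     = {2, 3, 1}      three query groups
// group_ptr = {0, 2, 5, 6}   rows [0,2) are group 0, [2,5) group 1, [5,6) group 2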
int XGDMatrixGetFloatInfo(const DMatrixHandle handle,
const char *field,
bst_ulong *out_len,
const float **out_dptr) {
API_BEGIN();
const std::vector<float> &vec =
static_cast<const DataMatrix*>(handle)->info.GetFloatInfo(field);
*out_len = static_cast<bst_ulong>(vec.size());
*out_dptr = BeginPtr(vec);
API_END();
}

int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
const char *field,
bst_ulong *out_len,
const unsigned **out_dptr) {
API_BEGIN();
const std::vector<unsigned> &vec =
static_cast<const DataMatrix*>(handle)->info.GetUIntInfo(field);
*out_len = static_cast<bst_ulong>(vec.size());
*out_dptr = BeginPtr(vec);
API_END();
}

int XGDMatrixNumRow(const DMatrixHandle handle,
bst_ulong *out) {
API_BEGIN();
*out = static_cast<bst_ulong>(static_cast<const DataMatrix*>(handle)->info.num_row());
API_END();
}

int XGDMatrixNumCol(const DMatrixHandle handle,
bst_ulong *out) {
API_BEGIN();
*out = static_cast<bst_ulong>(static_cast<const DataMatrix*>(handle)->info.num_col());
API_END();
}

// xgboost implementation
int XGBoosterCreate(DMatrixHandle dmats[],
bst_ulong len,
BoosterHandle *out) {
API_BEGIN();
std::vector<DataMatrix*> mats;
for (bst_ulong i = 0; i < len; ++i) {
DataMatrix *dtr = static_cast<DataMatrix*>(dmats[i]);
mats.push_back(dtr);
}
*out = new Booster(mats);
API_END();
}

int XGBoosterFree(BoosterHandle handle) {
API_BEGIN();
delete static_cast<Booster*>(handle);
API_END();
}

int XGBoosterSetParam(BoosterHandle handle,
const char *name, const char *value) {
API_BEGIN();
static_cast<Booster*>(handle)->SetParam(name, value);
API_END();
}

int XGBoosterUpdateOneIter(BoosterHandle handle,
int iter,
DMatrixHandle dtrain) {
API_BEGIN();
Booster *bst = static_cast<Booster*>(handle);
DataMatrix *dtr = static_cast<DataMatrix*>(dtrain);
bst->CheckInitModel();
bst->CheckInit(dtr);
bst->UpdateOneIter(iter, *dtr);
API_END();
}

int XGBoosterBoostOneIter(BoosterHandle handle,
DMatrixHandle dtrain,
float *grad,
float *hess,
bst_ulong len) {
API_BEGIN();
Booster *bst = static_cast<Booster*>(handle);
DataMatrix *dtr = static_cast<DataMatrix*>(dtrain);
bst->CheckInitModel();
bst->CheckInit(dtr);
bst->BoostOneIter(*dtr, grad, hess, len);
API_END();
}

int XGBoosterEvalOneIter(BoosterHandle handle,
int iter,
DMatrixHandle dmats[],
const char *evnames[],
bst_ulong len,
const char **out_str) {
API_BEGIN();
Booster *bst = static_cast<Booster*>(handle);
std::vector<std::string> names;
std::vector<const DataMatrix*> mats;
for (bst_ulong i = 0; i < len; ++i) {
mats.push_back(static_cast<DataMatrix*>(dmats[i]));
names.push_back(std::string(evnames[i]));
}
bst->CheckInitModel();
bst->eval_str = bst->EvalOneIter(iter, mats, names);
*out_str = bst->eval_str.c_str();
API_END();
}

int XGBoosterPredict(BoosterHandle handle,
DMatrixHandle dmat,
int option_mask,
unsigned ntree_limit,
bst_ulong *len,
const float **out_result) {
API_BEGIN();
*out_result = static_cast<Booster*>(handle)->
Pred(*static_cast<DataMatrix*>(dmat),
option_mask, ntree_limit, len);
API_END();
}

int XGBoosterLoadModel(BoosterHandle handle, const char *fname) {
API_BEGIN();
static_cast<Booster*>(handle)->LoadModel(fname);
API_END();
}

int XGBoosterSaveModel(BoosterHandle handle, const char *fname) {
API_BEGIN();
Booster *bst = static_cast<Booster*>(handle);
bst->CheckInitModel();
bst->SaveModel(fname, false);
API_END();
}

int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
const void *buf,
bst_ulong len) {
API_BEGIN();
static_cast<Booster*>(handle)->LoadModelFromBuffer(buf, len);
API_END();
}

int XGBoosterGetModelRaw(BoosterHandle handle,
bst_ulong *out_len,
const char **out_dptr) {
API_BEGIN();
*out_dptr = static_cast<Booster*>(handle)->GetModelRaw(out_len);
API_END();
}
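GetModelRaw and LoadModelFromBuffer together give an in-memory round trip, which is how bindings move a model between boosters without touching the filesystem. A sketch (handles are stand-ins; error checks omitted):

bst_ulong raw_len;
const char *raw;
XGBoosterGetModelRaw(src_booster, &raw_len, &raw);
XGBoosterLoadModelFromBuffer(dst_booster, raw, raw_len);
// raw points into src_booster's internal buffer, so copy it before any
// further call that mutates src_booster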
int XGBoosterDumpModel(BoosterHandle handle,
const char *fmap,
int with_stats,
bst_ulong *len,
const char ***out_models) {
API_BEGIN();
utils::FeatMap featmap;
if (strlen(fmap) != 0) {
featmap.LoadText(fmap);
}
*out_models = static_cast<Booster*>(handle)->GetModelDump(
featmap, with_stats != 0, len);
API_END();
}

int XGBoosterDumpModelWithFeatures(BoosterHandle handle,
int fnum,
const char **fname,
const char **ftype,
int with_stats,
bst_ulong *len,
const char ***out_models) {
API_BEGIN();
utils::FeatMap featmap;
for (int i = 0; i < fnum; ++i) {
featmap.PushBack(i, fname[i], ftype[i]);
}
*out_models = static_cast<Booster*>(handle)->GetModelDump(
featmap, with_stats != 0, len);
API_END();
}