This commit is contained in:
tqchen 2015-07-03 19:35:23 -07:00
parent aba41d07cd
commit 1123253f79
10 changed files with 178 additions and 143 deletions

View File

@ -1,10 +1,12 @@
#ifndef XGBOOST_DATA_H
#define XGBOOST_DATA_H
/*!
* Copyright (c) 2014 by Contributors
* \file data.h
* \brief the input data structure for gradient boosting
* \author Tianqi Chen
*/
#ifndef XGBOOST_DATA_H_
#define XGBOOST_DATA_H_
#include <cstdio>
#include <vector>
#include "utils/utils.h"
@ -161,4 +163,4 @@ class IFMatrix {
virtual ~IFMatrix(void){}
};
} // namespace xgboost
#endif // XGBOOST_DATA_H
#endif // XGBOOST_DATA_H_

View File

@ -1,6 +1,8 @@
// Copyright by Contributors
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#define NOMINMAX
#include <string>
#include "../utils/io.h"
// implements a single no split version of DMLC
@ -154,7 +156,7 @@ class StdFile : public dmlc::Stream {
std::fwrite(ptr, size, 1, fp);
}
virtual void Seek(size_t pos) {
std::fseek(fp, static_cast<long>(pos), SEEK_SET);
std::fseek(fp, static_cast<long>(pos), SEEK_SET); // NOLINT(*)
}
virtual size_t Tell(void) {
return std::ftell(fp);

View File

@ -1,3 +1,4 @@
// Copyright 2014 by Contributors
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#define NOMINMAX

View File

@ -35,7 +35,7 @@ struct LibSVMPage : public SparsePage {
*/
class LibSVMPageFactory {
public:
explicit LibSVMPageFactory()
LibSVMPageFactory()
: bytes_read_(0), at_head_(true) {
}
inline bool Init(void) {
@ -199,6 +199,7 @@ class LibSVMParser : public utils::IIterator<LibSVMPage> {
inline size_t bytes_read(void) const {
return itr.get_factory().bytes_read();
}
private:
bool at_end_;
size_t data_ptr_;

View File

@ -1,11 +1,15 @@
#ifndef XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
#define XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
/*!
* Copyright (c) 2014 by Contributors
* \file page_dmatrix-inl.hpp
* row iterator based on sparse page
* \author Tianqi Chen
*/
#ifndef XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
#define XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
#include <vector>
#include <string>
#include <algorithm>
#include "../data.h"
#include "../utils/iterator.h"
#include "../utils/thread_buffer.h"
@ -94,12 +98,12 @@ class DMatrixPageBase : public DataMatrix {
fbin.Close();
if (!silent) {
utils::Printf("DMatrixPage: %lux%lu is saved to %s\n",
static_cast<unsigned long>(mat.info.num_row()),
static_cast<unsigned long>(mat.info.num_col()), fname_);
static_cast<unsigned long>(mat.info.num_row()), // NOLINT(*)
static_cast<unsigned long>(mat.info.num_col()), fname_); // NOLINT(*)
}
}
/*! \brief load and initialize the iterator with fi */
inline void LoadBinary(utils::FileStream &fi,
inline void LoadBinary(utils::FileStream &fi, // NOLINT(*)
bool silent,
const char *fname_) {
this->set_cache_file(fname_);
@ -114,8 +118,8 @@ class DMatrixPageBase : public DataMatrix {
iter_->Load(fs);
if (!silent) {
utils::Printf("DMatrixPage: %lux%lu matrix is loaded",
static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()));
static_cast<unsigned long>(info.num_row()), // NOLINT(*)
static_cast<unsigned long>(info.num_col())); // NOLINT(*)
if (fname_ != NULL) {
utils::Printf(" from %s\n", fname_);
} else {
@ -188,8 +192,8 @@ class DMatrixPageBase : public DataMatrix {
fs.Close();
if (!silent) {
utils::Printf("DMatrixPage: %lux%lu is parsed from %s\n",
static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(info.num_row()), // NOLINT(*)
static_cast<unsigned long>(info.num_col()), // NOLINT(*)
uri);
}
}

View File

@ -1,10 +1,16 @@
#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
/*!
* Copyright (c) 2014 by Contributors
* \file page_fmatrix-inl.hpp
* col iterator based on sparse page
* \author Tianqi Chen
*/
#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
#include <vector>
#include <string>
#include <algorithm>
namespace xgboost {
namespace io {
/*! \brief thread buffer iterator */

View File

@ -1,6 +1,5 @@
#ifndef XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
#define XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
/*!
* Copyright 2014 by Contributors
* \file simple_dmatrix-inl.hpp
* \brief simple implementation of DMatrixS that can be used
* the data format of xgboost is templatized, which means it can accept
@ -8,6 +7,9 @@
* this file is a specific implementation of input data structure that can be used by BoostLearner
* \author Tianqi Chen
*/
#ifndef XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
#define XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
#include <string>
#include <cstring>
#include <vector>
@ -123,9 +125,9 @@ class DMatrixSimple : public DataMatrix {
}
if (!silent) {
utils::Printf("%lux%lu matrix with %lu entries is loaded from %s\n",
static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(row_data_.size()), uri);
static_cast<unsigned long>(info.num_row()), // NOLINT(*)
static_cast<unsigned long>(info.num_col()), // NOLINT(*)
static_cast<unsigned long>(row_data_.size()), uri); // NOLINT(*)
}
// try to load in additional file
if (!loadsplit) {
@ -165,10 +167,11 @@ class DMatrixSimple : public DataMatrix {
* \param silent whether print information during loading
* \param fname file name, used to print message
*/
inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) {
inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) { // NOLINT(*)
int tmagic;
utils::Check(fs.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch", fname == NULL ? "" : fname);
utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch",
fname == NULL ? "" : fname);
info.LoadBinary(fs);
LoadBinary(fs, &row_ptr_, &row_data_);
@ -176,9 +179,9 @@ class DMatrixSimple : public DataMatrix {
if (!silent) {
utils::Printf("%lux%lu matrix with %lu entries is loaded",
static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(row_data_.size()));
static_cast<unsigned long>(info.num_row()), // NOLINT(*)
static_cast<unsigned long>(info.num_col()), // NOLINT(*)
static_cast<unsigned long>(row_data_.size())); // NOLINT(*)
if (fname != NULL) {
utils::Printf(" from %s\n", fname);
} else {
@ -205,9 +208,9 @@ class DMatrixSimple : public DataMatrix {
if (!silent) {
utils::Printf("%lux%lu matrix with %lu entries is saved to %s\n",
static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(row_data_.size()), fname);
static_cast<unsigned long>(info.num_row()), // NOLINT(*)
static_cast<unsigned long>(info.num_col()), // NOLINT(*)
static_cast<unsigned long>(row_data_.size()), fname); // NOLINT(*)
if (info.group_ptr.size() != 0) {
utils::Printf("data contains %u groups\n",
static_cast<unsigned>(info.group_ptr.size()-1));
@ -256,7 +259,7 @@ class DMatrixSimple : public DataMatrix {
* \param ptr pointer data
* \param data data content
*/
inline static void SaveBinary(utils::IStream &fo,
inline static void SaveBinary(utils::IStream &fo, // NOLINT(*)
const std::vector<size_t> &ptr,
const std::vector<RowBatch::Entry> &data) {
size_t nrow = ptr.size() - 1;
@ -272,7 +275,7 @@ class DMatrixSimple : public DataMatrix {
* \param out_ptr pointer data
* \param out_data data content
*/
inline static void LoadBinary(utils::IStream &fi,
inline static void LoadBinary(utils::IStream &fi, // NOLINT(*)
std::vector<size_t> *out_ptr,
std::vector<RowBatch::Entry> *out_data) {
size_t nrow;

View File

@ -1,11 +1,15 @@
#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
/*!
* Copyright 2014 by Contributors
* \file simple_fmatrix-inl.hpp
* \brief the input data structure for gradient boosting
* \author Tianqi Chen
*/
#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
#include <limits>
#include <algorithm>
#include <vector>
#include "../data.h"
#include "../utils/utils.h"
#include "../utils/random.h"
@ -94,7 +98,7 @@ class FMatrixS : public IFMatrix {
* \brief save column access data into stream
* \param fo output stream to save to
*/
inline void SaveColAccess(utils::IStream &fo) const {
inline void SaveColAccess(utils::IStream &fo) const { // NOLINT(*)
size_t n = 0;
fo.Write(&n, sizeof(n));
}
@ -102,7 +106,7 @@ class FMatrixS : public IFMatrix {
* \brief load column access data from stream
* \param fo output stream to load from
*/
inline void LoadColAccess(utils::IStream &fi) {
inline void LoadColAccess(utils::IStream &fi) { // NOLINT(*)
// do nothing in load col access
}
@ -159,8 +163,8 @@ class FMatrixS : public IFMatrix {
while (iter_->Next()) {
const RowBatch &batch = iter_->Value();
bmap.resize(bmap.size() + batch.size, true);
long batch_size = static_cast<long>(batch.size);
for (long i = 0; i < batch_size; ++i) {
long batch_size = static_cast<long>(batch.size); // NOLINT(*)
for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
buffered_rowset_.push_back(ridx);
@ -169,7 +173,7 @@ class FMatrixS : public IFMatrix {
}
}
#pragma omp parallel for schedule(static)
for (long i = 0; i < batch_size; ++i) {
for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
int tid = omp_get_thread_num();
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
if (bmap[ridx]) {
@ -188,7 +192,7 @@ class FMatrixS : public IFMatrix {
while (iter_->Next()) {
const RowBatch &batch = iter_->Value();
#pragma omp parallel for schedule(static)
for (long i = 0; i < static_cast<long>(batch.size); ++i) {
for (long i = 0; i < static_cast<long>(batch.size); ++i) { // NOLINT(*)
int tid = omp_get_thread_num();
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
if (bmap[ridx]) {
@ -367,4 +371,4 @@ class FMatrixS : public IFMatrix {
};
} // namespace io
} // namespace xgboost
#endif // XGBOOST_IO_SLICE_FMATRIX_INL_HPP
#endif // XGBOOST_IO_SLICE_FMATRIX_INL_HPP_

View File

@ -1,12 +1,16 @@
#ifndef XGBOOST_IO_SPARSE_BATCH_PAGE_H_
#define XGBOOST_IO_SPARSE_BATCH_PAGE_H_
/*!
* Copyright (c) 2014 by Contributors
* \file sparse_batch_page.h
* content holder of sparse batch that can be saved to disk
* the representation can be effectively
* use in external memory computation
* \author Tianqi Chen
*/
#ifndef XGBOOST_IO_SPARSE_BATCH_PAGE_H_
#define XGBOOST_IO_SPARSE_BATCH_PAGE_H_
#include <vector>
#include <algorithm>
#include "../data.h"
namespace xgboost {

View File

@ -1,14 +1,16 @@
// Copyright 2014 by Contributors
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#define NOMINMAX
#include <ctime>
#include <string>
#include <cstring>
#include <vector>
#include "./sync/sync.h"
#include "io/io.h"
#include "utils/utils.h"
#include "utils/config.h"
#include "learner/learner-inl.hpp"
#include "./io/io.h"
#include "./utils/utils.h"
#include "./utils/config.h"
#include "./learner/learner-inl.hpp"
namespace xgboost {
/*!
@ -90,12 +92,14 @@ class BoostLearnTask {
if (!strcmp("save_pbuffer", name)) save_with_pbuffer = atoi(val);
if (!strncmp("eval[", name, 5)) {
char evname[256];
utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display");
utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1,
"must specify evaluation name for display");
eval_data_names.push_back(std::string(evname));
eval_data_paths.push_back(std::string(val));
}
learner.SetParam(name, val);
}
public:
BoostLearnTask(void) {
// default parameters
@ -125,6 +129,7 @@ class BoostLearnTask {
}
if (data != NULL) delete data;
}
private:
inline void InitData(void) {
if (strchr(train_path.c_str(), '%') != NULL) {
@ -178,12 +183,12 @@ class BoostLearnTask {
int version = rabit::LoadCheckPoint(&learner);
if (version == 0) this->InitLearner();
const time_t start = time(NULL);
unsigned long elapsed = 0;
unsigned long elapsed = 0; // NOLINT(*)
learner.CheckInit(data);
bool allow_lazy = learner.AllowLazyCheckPoint();
for (int i = version / 2; i < num_round; ++i) {
elapsed = (unsigned long)(time(NULL) - start);
elapsed = (unsigned long)(time(NULL) - start); // NOLINT(*)
if (version % 2 == 0) {
if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
learner.UpdateOneIter(i, *data);
@ -215,7 +220,7 @@ class BoostLearnTask {
}
version += 1;
utils::Assert(version == rabit::VersionNumber(), "consistent check");
elapsed = (unsigned long)(time(NULL) - start);
elapsed = (unsigned long)(time(NULL) - start); // NOLINT(*)
}
// always save final round
if ((save_period == 0 || num_round % save_period != 0) && model_out != "NONE") {
@ -247,7 +252,8 @@ class BoostLearnTask {
}
inline void SaveModel(int i) const {
char fname[256];
sprintf(fname, "%s/%04d.model", model_dir_path.c_str(), i + 1);
utils::SPrintf(fname, sizeof(fname),
"%s/%04d.model", model_dir_path.c_str(), i + 1);
this->SaveModel(fname);
}
inline void TaskPred(void) {
@ -266,6 +272,7 @@ class BoostLearnTask {
}
if (fo != stdout) fclose(fo);
}
private:
/*! \brief whether silent */
int silent;
@ -309,6 +316,7 @@ class BoostLearnTask {
std::vector<std::string> eval_data_paths;
/*! \brief the names of the evaluation data used in output log */
std::vector<std::string> eval_data_names;
private:
io::DataMatrix* data;
std::vector<io::DataMatrix*> deval;
@ -316,7 +324,7 @@ class BoostLearnTask {
utils::FeatMap fmap;
learner::BoostLearner learner;
};
}
} // namespace xgboost
int main(int argc, char *argv[]) {
xgboost::BoostLearnTask tsk;