/*! * Copyright (c) 2015 by Contributors * \file data.h * \brief simple data structure that could be used by model * * \author Tianqi Chen */ #ifndef RABIT_LEARN_DATA_H_ #define RABIT_LEARN_DATA_H_ #include #include #include #include #include #include #include #include #include "../io/io.h" namespace rabit { // typedef index type typedef unsigned index_t; /*! \brief sparse matrix, CSR format */ struct SparseMat { // sparse matrix entry struct Entry { // feature index index_t findex; // feature value float fvalue; }; // sparse vector struct Vector { const Entry *data; index_t length; inline const Entry &operator[](size_t i) const { return data[i]; } }; inline Vector operator[](size_t i) const { Vector v; v.data = &data[0] + row_ptr[i]; v.length = static_cast(row_ptr[i + 1]-row_ptr[i]); return v; } // load data from LibSVM format inline void Load(const char *fname) { io::InputSplit *in = io::CreateInputSplit (fname, rabit::GetRank(), rabit::GetWorldSize()); row_ptr.clear(); row_ptr.push_back(0); data.clear(); feat_dim = 0; std::string line; while (in->NextLine(&line)) { float label; std::istringstream ss(line); ss >> label; Entry e; unsigned long fidx; while (!ss.eof()) { if (!(ss >> fidx)) break; ss.ignore(32, ':'); if (!(ss >> e.fvalue)) break; e.findex = static_cast(fidx); data.push_back(e); feat_dim = std::max(fidx, feat_dim); } labels.push_back(label); row_ptr.push_back(data.size()); } delete in; feat_dim += 1; utils::Check(feat_dim < std::numeric_limits::max(), "feature dimension exceed limit of index_t"\ "consider change the index_t to unsigned long"); } inline size_t NumRow(void) const { return row_ptr.size() - 1; } // memory cost inline size_t MemCost(void) const { return data.size() * sizeof(Entry); } // maximum feature dimension size_t feat_dim; std::vector row_ptr; std::vector data; std::vector labels; }; // dense matrix struct Matrix { inline void Init(size_t nrow, size_t ncol, float v = 0.0f) { this->nrow = nrow; this->ncol = ncol; data.resize(nrow * ncol); std::fill(data.begin(), data.end(), v); } inline float *operator[](size_t i) { return &data[0] + i * ncol; } inline const float *operator[](size_t i) const { return &data[0] + i * ncol; } inline void Print(const char *fname) { FILE *fo; if (!strcmp(fname, "stdout")) { fo = stdout; } else { fo = utils::FopenCheck(fname, "w"); } for (size_t i = 0; i < data.size(); ++i) { fprintf(fo, "%g", data[i]); if ((i+1) % ncol == 0) { fprintf(fo, "\n"); } else { fprintf(fo, " "); } } // close the filed if (fo != stdout) fclose(fo); } // number of data size_t nrow, ncol; std::vector data; }; /*!\brief computes a random number modulo the value */ inline int Random(int value) { return rand() % value; } } // namespace rabit #endif // RABIT_LEARN_DATA_H_