diff --git a/src/utils/quantile.h b/src/utils/quantile.h index 0fadd9c35..e51dc16ba 100644 --- a/src/utils/quantile.h +++ b/src/utils/quantile.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "./utils.h" namespace xgboost { @@ -100,7 +101,6 @@ struct WQSummary { } } }; - /*! \brief data field */ Entry *data; /*! \brief number of elements in the summary */ @@ -123,19 +123,15 @@ struct WQSummary { inline RType MaxRank(void) const { return data[size - 1].rmax; } - /*! \brief set size to 0 */ - inline void Clear(void) { - size = 0; - } /*! * \brief copy content from src * \param src source sketch */ inline void CopyFrom(const WQSummary &src) { size = src.size; - std::memcpy(data, src.data, sizeof(Entry) * size); - -} /*! + std::memcpy(data, src.data, sizeof(Entry) * size); + } + /*! * \brief debug function, validate whether the summary * run consistency check to check if it is a valid summary * \param eps the tolerate error level, used when RType is floating point and @@ -153,9 +149,9 @@ struct WQSummary { /*! \brief used for debug purpose, print the summary */ inline void Print(void) const { for (size_t i = 0; i < size; ++i) { - printf("x=%f\t[%f, %f] wmin=%f\n", - data[i].value, data[i].rmin, - data[i].rmax, data[i].wmin); + std::cout << "x=" << data[i].value << "\t" + << "[" << data[i].rmin << "," << data[i].rmax << "]" + << " wmin=" << data[i].wmin << std::endl; } } /*! @@ -250,6 +246,147 @@ struct WQSummary { } }; +/*! + * \brief traditional GK summary + */ +template +struct GKSummary { + /*! \brief an entry in the sketch summary */ + struct Entry { + /*! \brief minimum rank */ + RType rmin; + /*! \brief maximum rank */ + RType rmax; + /*! \brief the value of data */ + DType value; + // constructor + Entry(void) {} + // constructor + Entry(RType rmin, RType rmax, DType value) + : rmin(rmin), rmax(rmax), value(value) {} + }; + /*! \brief input data queue before entering the summary */ + struct Queue { + // the input queue + std::vector queue; + // end of the queue + size_t qtail; + // push data to the queue + inline void Push(DType x, RType w) { + queue[qtail++] = x; + } + inline void MakeSummary(GKSummary *out) { + std::sort(queue.begin(), queue.begin() + qtail); + out->size = qtail; + for (size_t i = 0; i < qtail; ++i) { + out->data[i] = Entry(i + 1, i + 1, queue[i]); + } + } + }; + /*! \brief data field */ + Entry *data; + /*! \brief number of elements in the summary */ + size_t size; + GKSummary(Entry *data, size_t size) + : data(data), size(size) {} + /*! \brief the maximum error of the summary */ + inline RType MaxError(void) const { + RType res = 0; + for (size_t i = 1; i < size; ++i) { + res = std::max(data[i].rmax - data[i-1].rmin, res); + } + return res; + } + /*! \return maximum rank in the summary */ + inline RType MaxRank(void) const { + return data[size - 1].rmax; + } + /*! + * \brief copy content from src + * \param src source sketch + */ + inline void CopyFrom(const GKSummary &src) { + size = src.size; + std::memcpy(data, src.data, sizeof(Entry) * size); + } + inline void CheckValid(RType eps) const { + // assume always valid + } + /*! \brief used for debug purpose, print the summary */ + inline void Print(void) const { + for (size_t i = 0; i < size; ++i) { + std::cout << "x=" << data[i].value << "\t" + << "[" << data[i].rmin << "," << data[i].rmax << "]" + << std::endl; + } + } + /*! + * \brief set current summary to be pruned summary of src + * assume data field is already allocated to be at least maxsize + * \param src source summary + * \param maxsize size we can afford in the pruned sketch + */ + inline void SetPrune(const GKSummary &src, RType maxsize) { + if (src.size <= maxsize) { + this->CopyFrom(src); return; + } + const RType max_rank = src.MaxRank(); + this->size = maxsize; + data[0] = src.data[0]; + size_t n = maxsize - 1; + RType top = 1; + for (size_t i = 1; i < n; ++i) { + RType k = (i * max_rank) / n; + while (k > src.data[top + 1].rmax) ++top; + // assert src.data[top].rmin <= k + // because k > src.data[top].rmax >= src.data[top].rmin + if ((k - src.data[top].rmin) < (src.data[top+1].rmax - k)) { + data[i] = src.data[top]; + } else { + data[i] = src.data[top + 1]; + } + } + data[n] = src.data[src.size - 1]; + } + inline void SetCombine(const GKSummary &sa, + const GKSummary &sb) { + utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); + const Entry *a = sa.data, *a_end = sa.data + sa.size; + const Entry *b = sb.data, *b_end = sb.data + sb.size; + this->size = sa.size + sb.size; + RType aprev_rmin = 0, bprev_rmin = 0; + Entry *dst = this->data; + while (a != a_end && b != b_end) { + if (a->value < b->value) { + *dst = Entry(bprev_rmin + a->rmin, + a->rmax + b->rmax - 1, a->value); + aprev_rmin = a->rmin; + ++dst; ++a; + } else { + *dst = Entry(aprev_rmin + b->rmin, + b->rmax + a->rmax - 1, b->value); + bprev_rmin = b->rmin; + ++dst; ++b; + } + } + if (a != a_end) { + RType bprev_rmax = (b_end - 1)->rmax; + do { + *dst = Entry(bprev_rmin + a->rmin, bprev_rmax + a->rmax, a->value); + ++dst; ++a; + } while (a != a_end); + } + if (b != b_end) { + RType aprev_rmax = (a_end - 1)->rmax; + do { + *dst = Entry(aprev_rmin + b->rmin, aprev_rmax + b->rmax, b->value); + ++dst; ++b; + } while (b != b_end); + } + utils::Assert(dst == data + size, "bug in combine"); + } +}; + /*! * \brief template for all quantle sketch algorithm * that uses merge/prune scheme @@ -418,7 +555,7 @@ class QuantileSketchTemplate { }; /*! - * \brief Quantiel sketch use WQSummary + * \brief Quantile sketch use WQSummary * \tparam DType type of data content * \tparam RType type of rank */ @@ -426,262 +563,16 @@ template class WQuantileSketch : public QuantileSketchTemplate >{ }; - /*! - * \brief a helper class to compute streaming quantile + * \brief Quantile sketch use WQSummary * \tparam DType type of data content * \tparam RType type of rank */ template -class GKQuantileSketch { - public: - /*! \brief an entry in the sketch summary */ - struct Entry { - /*! \brief minimum rank */ - RType rmin; - /*! \brief maximum rank */ - RType rmax; - /*! \brief the value of data */ - DType value; - // constructor - Entry(void) {} - // constructor - Entry(RType rmin, RType rmax, DType value) - : rmin(rmin), rmax(rmax), value(value) {} - }; - /*! - * \brief this is data structure presenting one summary - */ - struct Summary { - /*! \brief data field */ - Entry *data; - /*! \brief number of elements in the summary */ - RType size; - /*! \brief the maximum error of the summary */ - inline RType MaxError(void) const { - RType res = 0; - for (size_t i = 1; i < size; ++i) { - res = std::max(data[i].rmax - data[i-1].rmin, res); - } - return res; - } - /*! \return maximum rank in the summary */ - inline RType MaxRank(void) const { - return data[size - 1].rmax; - } - /*! \brief set size to 0 */ - inline void Clear(void) { - size = 0; - } - /*! - * \brief copy content from src - * \param src source sketch - */ - inline void CopyFrom(const Summary &src) { - size = src.size; - std::memcpy(data, src.data, sizeof(Entry) * size); - } - /*! - * \brief set current summary to be pruned summary of src - * assume data field is already allocated to be at least maxsize - * \param src source summary - * \param maxsize size we can afford in the pruned sketch - */ - inline void SetPrune(const Summary &src, RType maxsize) { - const RType max_rank = src.MaxRank(); - this->size = maxsize; - data[0] = src.data[0]; - RType n = maxsize - 1; - RType top = 1; - for (size_t i = 1; i < n; ++i) { - RType k = (i * max_rank) / n; - while (k > src.data[top + 1].rmax) ++top; - // assert src.data[top].rmin <= k - // because k > src.data[top].rmax >= src.data[top].rmin - if ((k - src.data[top].rmin) < (src.data[top+1].rmax - k)) { - data[i] = src.data[top]; - } else { - data[i] = src.data[top + 1]; - } - } - data[n] = src.data[src.size - 1]; - } - inline void SetCombine(const Summary &sa, - const Summary &sb) { - utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge"); - const Entry *a = sa.data, *a_end = sa.data + sa.size; - const Entry *b = sb.data, *b_end = sb.data + sb.size; - this->size = sa.size + sb.size; - RType aprev_rmin = 0, bprev_rmin = 0; - Entry *dst = this->data; - while (a != a_end && b != b_end) { - if (a->value < b->value) { - *dst = Entry(bprev_rmin + a->rmin, - a->rmax + b->rmax - 1, a->value); - aprev_rmin = a->rmin; - ++dst; ++a; - } else { - *dst = Entry(aprev_rmin + b->rmin, - b->rmax + a->rmax - 1, b->value); - bprev_rmin = b->rmin; - ++dst; ++b; - } - } - if (a != a_end) { - RType bprev_rmax = (b_end - 1)->rmax; - do { - *dst = Entry(bprev_rmin + a->rmin, bprev_rmax + a->rmax, a->value); - ++dst; ++a; - } while (a != a_end); - } - if (b != b_end) { - RType aprev_rmax = (a_end - 1)->rmax; - do { - *dst = Entry(aprev_rmin + b->rmin, aprev_rmax + b->rmax, b->value); - ++dst; ++b; - } while (b != b_end); - } - utils::Assert(dst == data + size, "bug in combine"); - } - }; - // same as summary, but use STL to backup the space - struct SummaryContainer : public Summary { - std::vector space; - /*! \brief reserve space for summary */ - inline void Reserve(size_t size) { - space.resize(size); - this->data = BeginPtr(space); - } - /*! - * \brief set the space to be merge of all Summary arrays - * \param begin begining position in th summary array - * \param end ending position in the Summary array - */ - inline void SetMerge(const Summary *begin, - const Summary *end) { - utils::Assert(begin < end, "can not set combine to empty instance"); - size_t len = end - begin; - if (len == 1) { - this->Reserve(begin[0].size); - this->CopyFrom(begin[0]); - } else if (len == 2) { - this->Reserve(begin[0].size + begin[1].size); - this->SetMerge(begin[0], begin[1]); - } else { - // recursive merge - SummaryContainer lhs, rhs; - lhs.SetCombine(begin, begin + len / 2); - rhs.SetCombine(begin + len / 2, end); - this->Reserve(lhs.size + rhs.size); - this->SetCombine(lhs, rhs); - } - } - }; - /*! - * \brief intialize the quantile sketch, given the performance specification - * \param maxn maximum number of data points can be encountered - * \param eps accuracy level of summary - */ - inline void Init(RType maxn, double eps) { - eps = eps * 0.5; - size_t L = 0; - size_t b = std::max(floor(log2(eps * maxn) / eps), 8.0); - // check for lower - while (b < maxn) { - L = ceil(log2(maxn / b)) + 1; - if (L < eps * b) break; - ++b; - } - L += 1; - inqueue.resize(b); - limit_size = (b + 1) / 2 + 1; - temp.Reserve(limit_size * 2); - data.resize(limit_size * L); - for (size_t l = 0; l < L; ++l) { - Summary s; s.size = 0; - s.data = BeginPtr(data) + l * limit_size; - level.push_back(s); - } - qtail = 0; - } - /*! - * \brief add an element to a sketch - * \param x the elemented added to the sketch - */ - inline void Push(DType x) { - inqueue[qtail++] = x; - if (qtail == inqueue.size()) { - // start update sketch - std::sort(inqueue.begin(), inqueue.end()); - for (size_t i = 0; i < qtail; ++i) { - temp.data[i] = Entry(i + 1, i + 1, inqueue[i]); - } - temp.size = static_cast(qtail); - // clean up queue - qtail = 0; - for (size_t l = 1; l < level.size(); ++l) { - // check if level l is empty - if (level[l].size == 0) { - level[l].SetPrune(temp, limit_size); - return; - } else { - // level 0 is actually temp space - level[0].SetPrune(temp, limit_size); - temp.SetCombine(level[0], level[l]); - level[l].size = 0; - } - } - utils::Error("adding more element than allowed"); - } - } - /*! - * \brief finalize the result after all data has been passed - * copy the final result to level 0 - * this can only be called once - */ - inline void Finalize(void) { - // start update sketch - std::sort(inqueue.begin(), inqueue.begin() + qtail); - for (size_t i = 0; i < qtail; ++i) { - temp.data[i] = Entry(i + 1, i + 1, inqueue[i]); - } - temp.size = static_cast(qtail); - if (temp.size < limit_size) { - level[0].CopyFrom(temp); - } else { - level[0].SetPrune(temp, limit_size); - } - // start adding other things in - for (size_t l = 1; l < level.size(); ++l) { - if (level[l].size == 0) continue; - if (level[0].size == 0) { - level[0].CopyFrom(level[l]); - } else { - temp.SetCombine(level[0], level[l]); - level[0].SetPrune(temp, limit_size); - } - level[l].size = 0; - } - } - /*! \brief get the summary after finalize */ - inline Summary GetSummary(void) const { - return level[0]; - } - - private: - // the input queue - std::vector inqueue; - // end of the queue - size_t qtail; - // size of summary in each level - size_t limit_size; - // content of the summary - std::vector data; - // different level of summary - std::vector level; - // temporal summary, used for temp-merge - SummaryContainer temp; +class GKQuantileSketch : + public QuantileSketchTemplate >{ }; + } // utils } // xgboost #endif diff --git a/test/test_quantile.cpp b/test/test_quantile.cpp index 89e976897..e6af5b1ec 100644 --- a/test/test_quantile.cpp +++ b/test/test_quantile.cpp @@ -2,8 +2,10 @@ #include using namespace xgboost; -int main(int argc, char *argv[]) { - utils::WQuantileSketch sketch; + +template +inline void test(void) { + Sketch sketch; size_t n; double wsum = 0.0; float eps, x, w; @@ -11,13 +13,25 @@ int main(int argc, char *argv[]) { sketch.Init(n, eps); printf("nlevel = %lu, limit_size=%lu\n", sketch.nlevel, sketch.limit_size); while (scanf("%f%f", &x, &w) == 2) { - sketch.Push(x, w); + sketch.Push(x, static_cast(w)); wsum += w; } - sketch.CheckValid(0.1); - utils::WQuantileSketch::SummaryContainer out; + sketch.CheckValid(static_cast(0.1)); + typename Sketch::SummaryContainer out; sketch.GetSummary(&out); - printf("MaxError=%f/%f = %g\n", out.MaxError(), wsum, out.MaxError() / wsum); + double maxerr = static_cast(out.MaxError()); + printf("MaxError=%g/%g = %g\n", maxerr, wsum, maxerr / wsum); out.Print(); +} + +int main(int argc, char *argv[]) { + const char *method = "wq"; + if (argc > 1) method = argv[1]; + if (!strcmp(method, "wq")) { + test, float>(); + } + if (!strcmp(method, "gk")) { + test, unsigned>(); + } return 0; }