xgboost/src/c_api/c_api.cc

// Copyright (c) 2014 by Contributors

#include <xgboost/data.h>
#include <xgboost/learner.h>
#include <xgboost/c_api.h>
#include <xgboost/logging.h>
#include <rabit/rabit.h>
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <memory>
#include <string>
#include <vector>

#include "./c_api_error.h"
#include "../data/simple_csr_source.h"
#include "../common/thread_local.h"
#include "../common/math.h"
#include "../common/io.h"
#include "../common/group_data.h"

namespace xgboost {
/*!
 * \brief Wrapper around Learner, kept for backward compatibility.
 *
 *  Parameters are collected in cfg_ and passed to Learner::Configure.
 *  Configuration and model initialization are postponed until LazyInit()
 *  is called.
 */
class Booster {
 public:
  /*!
   * \brief Build the wrapped learner.
   * \param cache_mats matrices forwarded to Learner::Create.
   */
  explicit Booster(const std::vector<DMatrix*>& cache_mats)
      : configured_(false),
        initialized_(false),
        learner_(Learner::Create(cache_mats)) {}

  /*! \return non-owning pointer to the wrapped learner. */
  inline Learner* learner() {
    return learner_.get();
  }

  /*!
   * \brief Store a parameter; an existing entry with the same name is overwritten.
   *  When the learner was already configured, Configure is re-run immediately.
   * \param name parameter name.
   * \param val parameter value.
   */
  inline void SetParam(const std::string& name, const std::string& val) {
    bool updated = false;
    for (auto& entry : cfg_) {
      if (entry.first == name) {
        // only the first matching entry is touched
        entry.second = val;
        updated = true;
        break;
      }
    }
    if (!updated) {
      cfg_.emplace_back(name, val);
    }
    if (configured_) {
      learner_->Configure(cfg_);
    }
  }

  /*! \brief Configure the learner and initialize the model, each at most once. */
  inline void LazyInit() {
    if (!configured_) {
      learner_->Configure(cfg_);
      configured_ = true;
    }
    if (initialized_) {
      return;
    }
    learner_->InitModel();
    initialized_ = true;
  }

  /*!
   * \brief Load the learner from a stream and mark the model as initialized,
   *  so that LazyInit() will not call InitModel afterwards.
   * \param fi input stream handed to Learner::Load.
   */
  inline void LoadModel(dmlc::Stream* fi) {
    learner_->Load(fi);
    initialized_ = true;
  }

 public:
  /*! \brief whether Learner::Configure has been run by LazyInit. */
  bool configured_;
  /*! \brief whether the model has been initialized or loaded. */
  bool initialized_;
  /*! \brief the wrapped learner. */
  std::unique_ptr<Learner> learner_;
  /*! \brief accumulated (name, value) parameter pairs. */
  std::vector<std::pair<std::string, std::string> > cfg_;
};

// Forward declaration of the data callback. NativeDataIter::Next passes this
// function to the user supplied "next" callback, together with the iterator
// itself as the handle argument.
XGB_EXTERN_C int XGBoostNativeDataIterSetData(
    void *handle, XGBoostBatchCSR batch);

/*!
 * \brief Parser that obtains its row blocks from a user supplied callback.
 *
 *  Every call to Next() invokes the callback, which is expected to hand a
 *  CSR batch back through XGBoostNativeDataIterSetData. The batch is copied
 *  into internal buffers that back the RowBlock returned by Value().
 */
class NativeDataIter : public dmlc::Parser<uint32_t> {
 public:
  /*!
   * \param data_handle opaque handle passed back to the callback.
   * \param next_callback callback used to request the next batch.
   */
  NativeDataIter(DataIterHandle data_handle,
                 XGBCallbackDataIterNext* next_callback)
      : at_first_(true),
        bytes_read_(0),
        data_handle_(data_handle),
        next_callback_(next_callback) {}

  /*! \brief Rewinding is unsupported; only valid before any batch was read. */
  void BeforeFirst() override {
    CHECK(at_first_) << "cannot reset NativeDataIter";
  }

  /*!
   * \brief Ask the callback for another batch.
   * \return true when the callback returned a non-zero value.
   */
  bool Next() override {
    const bool has_batch =
        (*next_callback_)(data_handle_, XGBoostNativeDataIterSetData, this) != 0;
    if (!has_batch) {
      return false;
    }
    at_first_ = false;
    return true;
  }

  /*! \return the block describing the most recently copied batch. */
  const dmlc::RowBlock<uint32_t>& Value() const override {
    return block_;
  }

  /*! \return accumulated size in bytes of all data copied by SetData. */
  size_t BytesRead() const override {
    return bytes_read_;
  }

  /*!
   * \brief Copy a CSR batch into the internal buffers and refresh block_.
   * \param batch batch provided by the callback; label, weight, index and
   *  value may be null, in which case the matching buffer stays empty.
   */
  void SetData(const XGBoostBatchCSR& batch) {
    value_.clear();
    index_.clear();
    weight_.clear();
    label_.clear();
    // batch.size rows are described by batch.size + 1 offsets.
    offset_.assign(batch.offset, batch.offset + batch.size + 1);

    // Range of entries referenced by this batch, in the caller's numbering.
    const size_t first = offset_.front();
    const size_t last = offset_.back();

    if (batch.label != nullptr) {
      label_.assign(batch.label, batch.label + batch.size);
    }
    if (batch.weight != nullptr) {
      weight_.assign(batch.weight, batch.weight + batch.size);
    }
    if (batch.index != nullptr) {
      index_.assign(batch.index + first, batch.index + last);
    }
    if (batch.value != nullptr) {
      value_.assign(batch.value + first, batch.value + last);
    }

    // Only the referenced entries were copied, so shift offsets to start at 0.
    if (first != 0) {
      for (size_t& off : offset_) {
        off -= first;
      }
    }

    block_.size = batch.size;
    block_.offset = dmlc::BeginPtr(offset_);
    block_.label = dmlc::BeginPtr(label_);
    block_.weight = dmlc::BeginPtr(weight_);
    block_.index = dmlc::BeginPtr(index_);
    block_.value = dmlc::BeginPtr(value_);

    const size_t real_bytes =
        (label_.size() + weight_.size() + value_.size()) * sizeof(dmlc::real_t);
    bytes_read_ += offset_.size() * sizeof(size_t) +
        index_.size() * sizeof(uint32_t) +
        real_bytes;
  }

 private:
  // true until the first batch has been read.
  bool at_first_;
  // number of bytes read so far.
  size_t bytes_read_;
  // handle passed to the callback.
  DataIterHandle data_handle_;
  // callback used to fetch the data.
  XGBCallbackDataIterNext* next_callback_;
  // internal offset
  std::vector<size_t> offset_;
  // internal label data
  std::vector<dmlc::real_t> label_;
  // internal weight data
  std::vector<dmlc::real_t> weight_;
  // internal index.
  std::vector<uint32_t> index_;
  // internal value.
  std::vector<dmlc::real_t> value_;
  // internal RowBlock pointing into the buffers above.
  dmlc::RowBlock<uint32_t> block_;
};

int XGBoostNativeDataIterSetData(
    void *handle, XGBoostBatchCSR batch) {
  API_BEGIN();
  static_cast<xgboost::NativeDataIter*>(handle)->SetData(batch);
  API_END();
}
}  // namespace xgboost

// Bring the xgboost namespace into scope for the C API definitions below.
using namespace xgboost; // NOLINT(*);

/*!
 * \brief Entry that holds the data returned by the C API functions.
 *
 *  The C API hands out pointers into these members (for example
 *  ret_str.c_str()), so the members act as backing storage for the results.
 *  Instances are obtained through XGBAPIThreadLocalStore::Get().
 */
struct XGBAPIThreadLocalEntry {
  /*! \brief result holder for returning string */
  std::string ret_str;
  /*! \brief result holder for returning strings */
  std::vector<std::string> ret_vec_str;
  /*! \brief result holder for returning string pointers */
  std::vector<const char *> ret_vec_charp;
  /*! \brief returning float vector. */
  std::vector<float> ret_vec_float;
  /*! \brief temp variable of gradient pairs. */
  std::vector<bst_gpair> tmp_gpair;
};

// Store type used to obtain the XGBAPIThreadLocalEntry via Get().
using XGBAPIThreadLocalStore = xgboost::common::ThreadLocalStore<XGBAPIThreadLocalEntry>;

/*!
 * \brief Load a DMatrix from a file.
 * \param fname path/URI handed to DMatrix::Load.
 * \param silent non-zero to request quiet loading.
 * \param out receives the handle of the created matrix.
 */
int XGDMatrixCreateFromFile(const char *fname,
                            int silent,
                            DMatrixHandle *out) {
  API_BEGIN();
  if (rabit::IsDistributed()) {
    LOG(CONSOLE) << "XGBoost distributed mode detected, "
                 << "will split data among workers";
  }
  // Honour the caller's silent flag instead of always passing false.
  // NOTE(review): assumes the second parameter of DMatrix::Load is the
  // silent flag -- confirm against xgboost/data.h. The third argument is
  // kept as before.
  *out = DMatrix::Load(fname, silent != 0, true);
  API_END();
}

int XGDMatrixCreateFromDataIter(
    void* data_handle,
    XGBCallbackDataIterNext* callback,
    const char *cache_info,
    DMatrixHandle *out) {
  API_BEGIN();

  std::string scache;
  if (cache_info != nullptr) {
    scache = cache_info;
  }
  NativeDataIter parser(data_handle, callback);
  *out = DMatrix::Create(&parser, scache);
  API_END();
}

int XGDMatrixCreateFromCSR(const bst_ulong* indptr,
                           const unsigned *indices,
                           const float* data,
                           bst_ulong nindptr,
                           bst_ulong nelem,
                           DMatrixHandle* out) {
  std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());

  API_BEGIN();
  data::SimpleCSRSource& mat = *source;
  mat.row_ptr_.resize(nindptr);
  for (bst_ulong i = 0; i < nindptr; ++i) {
    mat.row_ptr_[i] = static_cast<size_t>(indptr[i]);
  }
  mat.row_data_.resize(nelem);
  for (bst_ulong i = 0; i < nelem; ++i) {
    mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
    mat.info.num_col = std::max(mat.info.num_col,
                                static_cast<uint64_t>(indices[i] + 1));
  }
  mat.info.num_row = nindptr - 1;
  mat.info.num_nonzero = static_cast<uint64_t>(nelem);
  *out  = DMatrix::Create(std::move(source));
  API_END();
}

int XGDMatrixCreateFromCSC(const bst_ulong* col_ptr,
                           const unsigned* indices,
                           const float* data,
                           bst_ulong nindptr,
                           bst_ulong nelem,
                           DMatrixHandle* out) {
  std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());

  API_BEGIN();
  int nthread;
  #pragma omp parallel
  {
    nthread = omp_get_num_threads();
  }
  data::SimpleCSRSource& mat = *source;
  common::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_);
  builder.InitBudget(0, nthread);
  long ncol = static_cast<long>(nindptr - 1);  // NOLINT(*)
  #pragma omp parallel for schedule(static)
  for (long i = 0; i < ncol; ++i) {  // NOLINT(*)
    int tid = omp_get_thread_num();
    for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
      builder.AddBudget(indices[j], tid);
    }
  }
  builder.InitStorage();
  #pragma omp parallel for schedule(static)
  for (long i = 0; i < ncol; ++i) {  // NOLINT(*)
    int tid = omp_get_thread_num();
    for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
      builder.Push(indices[j],
                   RowBatch::Entry(static_cast<bst_uint>(i), data[j]),
                   tid);
    }
  }
  mat.info.num_row = mat.row_ptr_.size() - 1;
  mat.info.num_col = static_cast<uint64_t>(ncol);
  mat.info.num_nonzero = nelem;
  *out  = DMatrix::Create(std::move(source));
  API_END();
}

int XGDMatrixCreateFromMat(const float* data,
                           bst_ulong nrow,
                           bst_ulong ncol,
                           float  missing,
                           DMatrixHandle* out) {
  std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());

  API_BEGIN();
  data::SimpleCSRSource& mat = *source;
  bool nan_missing = common::CheckNAN(missing);
  mat.info.num_row = nrow;
  mat.info.num_col = ncol;
  for (bst_ulong i = 0; i < nrow; ++i, data += ncol) {
    bst_ulong nelem = 0;
    for (bst_ulong j = 0; j < ncol; ++j) {
      if (common::CheckNAN(data[j])) {
        CHECK(nan_missing)
            << "There are NAN in the matrix, however, you did not set missing=NAN";
      } else {
        if (nan_missing || data[j] != missing) {
          mat.row_data_.push_back(RowBatch::Entry(j, data[j]));
          ++nelem;
        }
      }
    }
    mat.row_ptr_.push_back(mat.row_ptr_.back() + nelem);
  }
  mat.info.num_nonzero = mat.row_data_.size();
  *out  = DMatrix::Create(std::move(source));
  API_END();
}

/*!
 * \brief Create a new DMatrix from a selection of rows of an existing one.
 *
 *  Labels, weights and root_index are copied for the selected rows when
 *  present. Matrices with group structure are rejected.
 *
 * \param handle source matrix.
 * \param idxset row indices to copy, in output order.
 * \param len number of entries in idxset.
 * \param out receives the handle of the created matrix.
 */
int XGDMatrixSliceDMatrix(DMatrixHandle handle,
                          const int* idxset,
                          bst_ulong len,
                          DMatrixHandle* out) {
  std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());

  API_BEGIN();
  data::SimpleCSRSource src;
  src.CopyFrom(static_cast<DMatrix*>(handle));
  data::SimpleCSRSource& ret = *source;

  CHECK_EQ(src.info.group_ptr.size(), 0)
      << "slice does not support group structure";

  ret.Clear();
  ret.info.num_row = len;
  ret.info.num_col = src.info.num_col;

  dmlc::DataIter<RowBatch>* iter = &src;
  iter->BeforeFirst();
  CHECK(iter->Next());

  const RowBatch& batch = iter->Value();
  for (bst_ulong i = 0; i < len; ++i) {
    const int ridx = idxset[i];
    // Validate the index BEFORE it is used to address the batch. A negative
    // index converts to a huge unsigned value and is rejected as well.
    CHECK_LT(static_cast<bst_ulong>(ridx), batch.size);
    RowBatch::Inst inst = batch[ridx];
    // Append the row's entries behind the data copied so far.
    ret.row_data_.resize(ret.row_data_.size() + inst.length);
    std::memcpy(dmlc::BeginPtr(ret.row_data_) + ret.row_ptr_.back(), inst.data,
                sizeof(RowBatch::Entry) * inst.length);
    ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length);
    ret.info.num_nonzero += inst.length;

    if (src.info.labels.size() != 0) {
      ret.info.labels.push_back(src.info.labels[ridx]);
    }
    if (src.info.weights.size() != 0) {
      ret.info.weights.push_back(src.info.weights[ridx]);
    }
    if (src.info.root_index.size() != 0) {
      ret.info.root_index.push_back(src.info.root_index[ridx]);
    }
  }
  *out  = DMatrix::Create(std::move(source));
  API_END();
}

/*!
 * \brief Destroy a DMatrix.
 * \param handle matrix to delete.
 */
int XGDMatrixFree(DMatrixHandle handle) {
  API_BEGIN();
  DMatrix* dmat = static_cast<DMatrix*>(handle);
  delete dmat;
  API_END();
}

/*!
 * \brief Write a DMatrix to a local file via DMatrix::SaveToLocalFile.
 * \param handle matrix to save.
 * \param fname destination file name.
 * \param silent not used by this function.
 */
int XGDMatrixSaveBinary(DMatrixHandle handle,
                        const char* fname,
                        int silent) {
  API_BEGIN();
  DMatrix* dmat = static_cast<DMatrix*>(handle);
  dmat->SaveToLocalFile(fname);
  API_END();
}

/*!
 * \brief Set a float meta information field of a DMatrix.
 * \param handle target matrix.
 * \param field name of the field, forwarded to MetaInfo::SetInfo.
 * \param info array of len float values.
 * \param len number of values in info.
 */
int XGDMatrixSetFloatInfo(DMatrixHandle handle,
                          const char* field,
                          const float* info,
                          bst_ulong len) {
  API_BEGIN();
  DMatrix* dmat = static_cast<DMatrix*>(handle);
  dmat->info().SetInfo(field, info, kFloat32, len);
  API_END();
}

/*!
 * \brief Set an unsigned integer meta information field of a DMatrix.
 * \param handle target matrix.
 * \param field name of the field, forwarded to MetaInfo::SetInfo.
 * \param info array of len unsigned values.
 * \param len number of values in info.
 */
int XGDMatrixSetUIntInfo(DMatrixHandle handle,
                         const char* field,
                         const unsigned* info,
                         bst_ulong len) {
  API_BEGIN();
  DMatrix* dmat = static_cast<DMatrix*>(handle);
  dmat->info().SetInfo(field, info, kUInt32, len);
  API_END();
}

/*!
 * \brief Set the group structure of a DMatrix from per-group sizes.
 *
 *  group_ptr receives the running totals: group_ptr[0] is 0 and
 *  group_ptr[k] is the sum of the first k group sizes.
 *
 * \param handle target matrix.
 * \param group size of each group.
 * \param len number of groups.
 */
int XGDMatrixSetGroup(DMatrixHandle handle,
                      const unsigned* group,
                      bst_ulong len) {
  API_BEGIN();
  MetaInfo& meta = static_cast<DMatrix*>(handle)->info();
  auto& bounds = meta.group_ptr;
  bounds.resize(len + 1);
  bounds[0] = 0;
  for (uint64_t k = 1; k <= len; ++k) {
    bounds[k] = bounds[k - 1] + group[k - 1];
  }
  API_END();
}

int XGDMatrixGetFloatInfo(const DMatrixHandle handle,
                          const char* field,
                          bst_ulong* out_len,
                          const float** out_dptr) {
  API_BEGIN();
  const MetaInfo& info = static_cast<const DMatrix*>(handle)->info();
  const std::vector<float>* vec = nullptr;
  if (!std::strcmp(field, "label")) {
    vec = &info.labels;
  } else if (!std::strcmp(field, "weight")) {
    vec = &info.weights;
  } else if (!std::strcmp(field, "base_margin")) {
    vec = &info.base_margin;
  } else {
    LOG(FATAL) << "Unknown float field name " << field;
  }
  *out_len = static_cast<bst_ulong>(vec->size());
  *out_dptr = dmlc::BeginPtr(*vec);
  API_END();
}

int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
                         const char *field,
                         bst_ulong *out_len,
                         const unsigned **out_dptr) {
  API_BEGIN();
  const MetaInfo& info = static_cast<const DMatrix*>(handle)->info();
  const std::vector<unsigned>* vec = nullptr;
  if (!std::strcmp(field, "root_index")) {
    vec = &info.root_index;
  } else {
    LOG(FATAL) << "Unknown uint field name " << field;
  }
  *out_len = static_cast<bst_ulong>(vec->size());
  *out_dptr = dmlc::BeginPtr(*vec);
  API_END();
}

int XGDMatrixNumRow(const DMatrixHandle handle,
                    bst_ulong *out) {
  API_BEGIN();
  *out = static_cast<bst_ulong>(static_cast<const DMatrix*>(handle)->info().num_row);
  API_END();
}

int XGDMatrixNumCol(const DMatrixHandle handle,
                    bst_ulong *out) {
  API_BEGIN();
  *out = static_cast<size_t>(static_cast<const DMatrix*>(handle)->info().num_col);
  API_END();
}

// xgboost implementation
int XGBoosterCreate(const DMatrixHandle dmats[],
                    bst_ulong len,
                    BoosterHandle *out) {
  API_BEGIN();
  std::vector<DMatrix*> mats;
  for (bst_ulong i = 0; i < len; ++i) {
    mats.push_back(static_cast<DMatrix*>(dmats[i]));
  }
  *out = new Booster(mats);
  API_END();
}

/*!
 * \brief Destroy a Booster.
 * \param handle booster to delete.
 */
int XGBoosterFree(BoosterHandle handle) {
  API_BEGIN();
  Booster* bst = static_cast<Booster*>(handle);
  delete bst;
  API_END();
}

/*!
 * \brief Set a parameter on a Booster via Booster::SetParam.
 * \param handle target booster.
 * \param name parameter name.
 * \param value parameter value.
 */
int XGBoosterSetParam(BoosterHandle handle,
                      const char *name,
                      const char *value) {
  API_BEGIN();
  Booster* bst = static_cast<Booster*>(handle);
  bst->SetParam(name, value);
  API_END();
}

/*!
 * \brief Run one update iteration on the training matrix.
 * \param handle booster to update.
 * \param iter iteration number forwarded to Learner::UpdateOneIter.
 * \param dtrain training matrix.
 */
int XGBoosterUpdateOneIter(BoosterHandle handle,
                           int iter,
                           DMatrixHandle dtrain) {
  API_BEGIN();
  DMatrix* train = static_cast<DMatrix*>(dtrain);
  Booster* booster = static_cast<Booster*>(handle);

  // make sure the learner is configured and the model exists
  booster->LazyInit();
  booster->learner()->UpdateOneIter(iter, train);
  API_END();
}

/*!
 * \brief Boost once with gradient statistics supplied by the caller.
 * \param handle booster to update.
 * \param dtrain training matrix.
 * \param grad array of len first order values.
 * \param hess array of len second order values.
 * \param len number of entries in grad and hess.
 */
int XGBoosterBoostOneIter(BoosterHandle handle,
                          DMatrixHandle dtrain,
                          float *grad,
                          float *hess,
                          bst_ulong len) {
  std::vector<bst_gpair>& gpairs = XGBAPIThreadLocalStore::Get()->tmp_gpair;
  API_BEGIN();
  DMatrix* train = static_cast<DMatrix*>(dtrain);
  Booster* booster = static_cast<Booster*>(handle);

  // Pack the two arrays into the reusable buffer of gradient pairs.
  gpairs.resize(len);
  bst_ulong k = 0;
  for (bst_gpair& pair : gpairs) {
    pair = bst_gpair(grad[k], hess[k]);
    ++k;
  }

  booster->LazyInit();
  booster->learner()->BoostOneIter(0, train, &gpairs);
  API_END();
}

/*!
 * \brief Evaluate the model on a list of named matrices.
 * \param handle booster to evaluate.
 * \param iter iteration number forwarded to Learner::EvalOneIter.
 * \param dmats matrices to evaluate on.
 * \param evnames name of each matrix.
 * \param len number of entries in dmats and evnames.
 * \param out_str receives a pointer to the result string, which is stored
 *  in the XGBAPIThreadLocalStore entry (ret_str).
 */
int XGBoosterEvalOneIter(BoosterHandle handle,
                         int iter,
                         DMatrixHandle dmats[],
                         const char* evnames[],
                         bst_ulong len,
                         const char** out_str) {
  std::string& result = XGBAPIThreadLocalStore::Get()->ret_str;
  API_BEGIN();
  Booster* booster = static_cast<Booster*>(handle);
  std::vector<std::string> names;
  std::vector<DMatrix*> mats;

  for (bst_ulong k = 0; k < len; ++k) {
    mats.push_back(static_cast<DMatrix*>(dmats[k]));
    names.emplace_back(evnames[k]);
  }

  booster->LazyInit();
  result = booster->learner()->EvalOneIter(iter, mats, names);
  *out_str = result.c_str();
  API_END();
}

/*!
 * \brief Run prediction on a matrix.
 * \param handle booster used for prediction.
 * \param dmat input matrix.
 * \param option_mask bit mask; bit 0 and bit 1 are each forwarded to
 *  Learner::Predict as a boolean flag.
 * \param ntree_limit forwarded to Learner::Predict.
 * \param len receives the number of predictions.
 * \param out_result receives a pointer to the predictions, which are stored
 *  in the XGBAPIThreadLocalStore entry (ret_vec_float).
 */
int XGBoosterPredict(BoosterHandle handle,
                     DMatrixHandle dmat,
                     int option_mask,
                     unsigned ntree_limit,
                     bst_ulong *len,
                     const float **out_result) {
  std::vector<float>& result = XGBAPIThreadLocalStore::Get()->ret_vec_float;
  API_BEGIN();
  // NOTE(review): presumably "output margin" (bit 0) and "predict leaf"
  // (bit 1) -- confirm against the Learner::Predict declaration.
  const bool bit0_set = (option_mask & 1) != 0;
  const bool bit1_set = (option_mask & 2) != 0;
  DMatrix* input = static_cast<DMatrix*>(dmat);
  Booster* booster = static_cast<Booster*>(handle);
  booster->LazyInit();
  booster->learner()->Predict(input, bit0_set, &result, ntree_limit, bit1_set);
  *out_result = dmlc::BeginPtr(result);
  *len = static_cast<bst_ulong>(result.size());
  API_END();
}

/*!
 * \brief Load a model from a file into a Booster.
 * \param handle target booster.
 * \param fname file to read, opened through dmlc::Stream::Create.
 */
int XGBoosterLoadModel(BoosterHandle handle, const char* fname) {
  API_BEGIN();
  Booster* booster = static_cast<Booster*>(handle);
  std::unique_ptr<dmlc::Stream> input(dmlc::Stream::Create(fname, "r"));
  booster->LoadModel(input.get());
  API_END();
}

/*!
 * \brief Save the model of a Booster to a file.
 * \param handle booster to save.
 * \param fname file to write, opened through dmlc::Stream::Create.
 */
int XGBoosterSaveModel(BoosterHandle handle, const char* fname) {
  API_BEGIN();
  // The stream is opened first, exactly as before, then the model is
  // initialized if necessary and written.
  std::unique_ptr<dmlc::Stream> output(dmlc::Stream::Create(fname, "w"));
  Booster* booster = static_cast<Booster*>(handle);
  booster->LazyInit();
  Learner* learner = booster->learner();
  learner->Save(output.get());
  API_END();
}

/*!
 * \brief Load a model from a memory buffer into a Booster.
 * \param handle target booster.
 * \param buf start of the buffer holding the model.
 * \param len size of the buffer in bytes.
 */
int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
                                 const void* buf,
                                 bst_ulong len) {
  API_BEGIN();
  // MemoryFixSizeBuffer takes a non-const pointer, so constness is cast away.
  common::MemoryFixSizeBuffer reader(const_cast<void*>(buf), len);
  Booster* booster = static_cast<Booster*>(handle);
  booster->LoadModel(&reader);
  API_END();
}

/*!
 * \brief Serialize the model of a Booster into a memory buffer.
 * \param handle booster to serialize.
 * \param out_len receives the length of the buffer.
 * \param out_dptr receives a pointer to the buffer, which is stored in the
 *  XGBAPIThreadLocalStore entry (ret_str).
 */
int XGBoosterGetModelRaw(BoosterHandle handle,
                         bst_ulong* out_len,
                         const char** out_dptr) {
  std::string& buffer = XGBAPIThreadLocalStore::Get()->ret_str;
  // drop whatever an earlier call left in the shared string
  buffer.clear();

  API_BEGIN();
  common::MemoryBufferStream writer(&buffer);
  Booster* booster = static_cast<Booster*>(handle);
  booster->LazyInit();
  booster->learner()->Save(&writer);
  *out_dptr = dmlc::BeginPtr(buffer);
  *out_len = static_cast<bst_ulong>(buffer.length());
  API_END();
}

/*!
 * \brief Shared implementation of the model dump functions.
 *
 *  The dumped strings and the array of pointers to them are both kept in
 *  the XGBAPIThreadLocalStore entry (ret_vec_str / ret_vec_charp).
 *
 * \param handle booster to dump.
 * \param fmap feature map passed to Learner::Dump2Text.
 * \param with_stats non-zero is passed to Dump2Text as true.
 * \param len receives the number of dumped strings.
 * \param out_models receives a pointer to the array of C strings.
 */
inline void XGBoostDumpModelImpl(
    BoosterHandle handle,
    const FeatureMap& fmap,
    int with_stats,
    bst_ulong* len,
    const char*** out_models) {
  XGBAPIThreadLocalEntry* entry = XGBAPIThreadLocalStore::Get();
  std::vector<std::string>& dumps = entry->ret_vec_str;
  std::vector<const char*>& dump_ptrs = entry->ret_vec_charp;

  Booster* booster = static_cast<Booster*>(handle);
  booster->LazyInit();
  dumps = booster->learner()->Dump2Text(fmap, with_stats != 0);

  // Expose every dumped string as a C string.
  const size_t ndump = dumps.size();
  dump_ptrs.resize(ndump);
  for (size_t k = 0; k < ndump; ++k) {
    dump_ptrs[k] = dumps[k].c_str();
  }
  *out_models = dmlc::BeginPtr(dump_ptrs);
  *len = static_cast<bst_ulong>(dump_ptrs.size());
}
/*!
 * \brief Dump the model as text, optionally using a feature map file.
 * \param handle booster to dump.
 * \param fmap path of a feature map file; a null pointer or an empty string
 *  means that no feature map is loaded.
 * \param with_stats non-zero to include statistics in the dump.
 * \param len receives the number of dumped strings.
 * \param out_models receives a pointer to the array of C strings.
 */
int XGBoosterDumpModel(BoosterHandle handle,
                       const char* fmap,
                       int with_stats,
                       bst_ulong* len,
                       const char*** out_models) {
  API_BEGIN();
  FeatureMap featmap;
  // Guard against a null pointer: strlen(nullptr) is undefined behaviour.
  const bool has_fmap = (fmap != nullptr) && (fmap[0] != '\0');
  if (has_fmap) {
    std::unique_ptr<dmlc::Stream> fs(
        dmlc::Stream::Create(fmap, "r"));
    dmlc::istream is(fs.get());
    featmap.LoadText(is);
  }
  XGBoostDumpModelImpl(handle, featmap, with_stats, len, out_models);
  API_END();
}

/*!
 * \brief Dump the model as text using feature names and types given in memory.
 * \param handle booster to dump.
 * \param fnum number of features described by fname and ftype.
 * \param fname name of each feature.
 * \param ftype type string of each feature.
 * \param with_stats non-zero to include statistics in the dump.
 * \param len receives the number of dumped strings.
 * \param out_models receives a pointer to the array of C strings.
 */
int XGBoosterDumpModelWithFeatures(BoosterHandle handle,
                                   int fnum,
                                   const char** fname,
                                   const char** ftype,
                                   int with_stats,
                                   bst_ulong* len,
                                   const char*** out_models) {
  API_BEGIN();
  FeatureMap features;
  int fid = 0;
  while (fid < fnum) {
    features.PushBack(fid, fname[fid], ftype[fid]);
    ++fid;
  }
  XGBoostDumpModelImpl(handle, features, with_stats, len, out_models);
  API_END();
}

/*!
 * \brief Look up an attribute of the learner.
 * \param handle booster to query.
 * \param key attribute name.
 * \param out receives a pointer to the value (stored in the
 *  XGBAPIThreadLocalStore entry), or nullptr when the lookup fails.
 * \param success receives 1 when Learner::GetAttr returned true, otherwise 0.
 */
int XGBoosterGetAttr(BoosterHandle handle,
                     const char* key,
                     const char** out,
                     int* success) {
  Booster* booster = static_cast<Booster*>(handle);
  std::string& value = XGBAPIThreadLocalStore::Get()->ret_str;
  API_BEGIN();
  const bool found = booster->learner()->GetAttr(key, &value);
  *out = found ? value.c_str() : nullptr;
  *success = found ? 1 : 0;
  API_END();
}

/*!
 * \brief Set an attribute of the learner via Learner::SetAttr.
 * \param handle target booster.
 * \param key attribute name.
 * \param value attribute value.
 */
int XGBoosterSetAttr(BoosterHandle handle,
                     const char* key,
                     const char* value) {
  Booster* booster = static_cast<Booster*>(handle);
  API_BEGIN();
  Learner* learner = booster->learner();
  learner->SetAttr(key, value);
  API_END();
}

/*!
 * \brief Restore the learner from a rabit checkpoint.
 * \param handle target booster.
 * \param version receives the value returned by rabit::LoadCheckPoint.
 *  A non-zero value marks the booster as initialized.
 */
int XGBoosterLoadRabitCheckpoint(BoosterHandle handle,
                                 int* version) {
  API_BEGIN();
  Booster* booster = static_cast<Booster*>(handle);
  const int loaded = rabit::LoadCheckPoint(booster->learner());
  *version = loaded;
  if (loaded != 0) {
    // The model came from the checkpoint, so LazyInit must not re-create it.
    booster->initialized_ = true;
  }
  API_END();
}

/*!
 * \brief Save the learner as a rabit checkpoint.
 *
 *  rabit::LazyCheckPoint is used when the learner reports
 *  AllowLazyCheckPoint(), otherwise rabit::CheckPoint.
 *
 * \param handle booster to checkpoint.
 */
int XGBoosterSaveRabitCheckpoint(BoosterHandle handle) {
  API_BEGIN();
  Learner* learner = static_cast<Booster*>(handle)->learner();
  const bool lazy_allowed = learner->AllowLazyCheckPoint();
  if (!lazy_allowed) {
    rabit::CheckPoint(learner);
  } else {
    rabit::LazyCheckPoint(learner);
  }
  API_END();
}

// Force link rabit: initializing this file-local static calls RabitLinkTag(),
// which creates a reference from this translation unit to the rabit C API.
// NOTE(review): presumably this keeps the linker from dropping rabit's C API
// object file -- confirm against the rabit sources.
static int XGBOOST_LINK_RABIT_C_API_ = RabitLinkTag();