Support dataframe data format in native XGBoost. (#9828)

- Implement a columnar adapter.
- Refactor Python pandas handling code to avoid converting into a single numpy array.
- Add support in R for transforming columns.
- Support R data.frame and factor type.
This commit is contained in:
Jiaming Yuan
2023-12-12 09:56:31 +08:00
committed by GitHub
parent b3700bbb3f
commit faf0f2df10
21 changed files with 718 additions and 221 deletions

View File

@@ -41,6 +41,7 @@ extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP);
extern SEXP XGDMatrixCreateFromMat_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixGetFloatInfo_R(SEXP, SEXP);
extern SEXP XGDMatrixGetUIntInfo_R(SEXP, SEXP);
extern SEXP XGDMatrixCreateFromDF_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixGetStrFeatureInfo_R(SEXP, SEXP);
extern SEXP XGDMatrixNumCol_R(SEXP);
extern SEXP XGDMatrixNumRow_R(SEXP);
@@ -79,6 +80,7 @@ static const R_CallMethodDef CallEntries[] = {
{"XGDMatrixCreateFromMat_R", (DL_FUNC) &XGDMatrixCreateFromMat_R, 3},
{"XGDMatrixGetFloatInfo_R", (DL_FUNC) &XGDMatrixGetFloatInfo_R, 2},
{"XGDMatrixGetUIntInfo_R", (DL_FUNC) &XGDMatrixGetUIntInfo_R, 2},
{"XGDMatrixCreateFromDF_R", (DL_FUNC) &XGDMatrixCreateFromDF_R, 3},
{"XGDMatrixGetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixGetStrFeatureInfo_R, 2},
{"XGDMatrixNumCol_R", (DL_FUNC) &XGDMatrixNumCol_R, 1},
{"XGDMatrixNumRow_R", (DL_FUNC) &XGDMatrixNumRow_R, 1},

View File

@@ -223,6 +223,69 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) {
return ret;
}
XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads) {
SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
R_API_BEGIN();
DMatrixHandle handle;
auto make_vec = [&](auto const *ptr, std::int32_t len) {
auto v = xgboost::linalg::MakeVec(ptr, len);
return xgboost::linalg::ArrayInterface(v);
};
std::int32_t rc{0};
{
using xgboost::Json;
auto n_features = Rf_xlength(df);
std::vector<Json> array(n_features);
CHECK_GT(n_features, 0);
auto len = Rf_xlength(VECTOR_ELT(df, 0));
// The `data.frame` in R actually converts all data into numeric. The other type
// handlers here are not used. At the moment they are kept as a reference for when we
// can avoid making data copies during transformation.
for (decltype(n_features) i = 0; i < n_features; ++i) {
switch (TYPEOF(VECTOR_ELT(df, i))) {
case INTSXP: {
auto const *ptr = INTEGER(VECTOR_ELT(df, i));
array[i] = make_vec(ptr, len);
break;
}
case REALSXP: {
auto const *ptr = REAL(VECTOR_ELT(df, i));
array[i] = make_vec(ptr, len);
break;
}
case LGLSXP: {
auto const *ptr = LOGICAL(VECTOR_ELT(df, i));
array[i] = make_vec(ptr, len);
break;
}
default: {
LOG(FATAL) << "data.frame has unsupported type.";
}
}
}
Json jinterface{std::move(array)};
auto sinterface = Json::Dump(jinterface);
Json jconfig{xgboost::Object{}};
jconfig["missing"] = asReal(missing);
jconfig["nthread"] = asInteger(n_threads);
auto sconfig = Json::Dump(jconfig);
rc = XGDMatrixCreateFromColumnar(sinterface.c_str(), sconfig.c_str(), &handle);
}
CHECK_CALL(rc);
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
R_API_END();
Rf_unprotect(1);
return ret;
}
namespace {
void CreateFromSparse(SEXP indptr, SEXP indices, SEXP data, std::string *indptr_str,
std::string *indices_str, std::string *data_str) {
@@ -298,6 +361,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP
res_code = XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol,
config.c_str(), &handle);
}
CHECK_CALL(res_code);
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
R_API_END();

View File

@@ -53,6 +53,16 @@ XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP missing,
SEXP n_threads);
/**
* @brief Create matrix content from a data frame.
* @param data R data.frame object
* @param missing which value to represent missing value
* @param n_threads Number of threads used to construct DMatrix from dense matrix.
* @return created dmatrix
*/
XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads);
/*!
* \brief create a matrix content from CSC format
* \param indptr pointer to column headers