Use array interface for CSC matrix. (#8672)

* Use array interface for CSC matrix.

Use array interface for CSC matrix and align the interface with CSR and dense.

- Fix nthread issue in the R package DMatrix.
- Unify the behavior of handling `missing` with other inputs.
- Unify the behavior of handling `missing` around R, Python, Java, and Scala DMatrix.
- Expose `num_non_missing` to the JVM interface.
- Deprecate old CSR and CSC constructors.
This commit is contained in:
Jiaming Yuan
2023-02-05 01:59:46 +08:00
committed by GitHub
parent 213b5602d9
commit c1786849e3
23 changed files with 673 additions and 380 deletions

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2014-2022 by Contributors
Copyright (c) 2014-2023 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -79,17 +79,9 @@ public class DMatrix {
* @throws XGBoostError
*/
@Deprecated
public DMatrix(long[] headers, int[] indices, float[] data, DMatrix.SparseType st)
throws XGBoostError {
long[] out = new long[1];
if (st == SparseType.CSR) {
XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixCreateFromCSREx(headers, indices, data, 0, out));
} else if (st == SparseType.CSC) {
XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixCreateFromCSCEx(headers, indices, data, 0, out));
} else {
throw new UnknownError("unknow sparsetype");
}
handle = out[0];
public DMatrix(long[] headers, int[] indices, float[] data,
DMatrix.SparseType st) throws XGBoostError {
this(headers, indices, data, st, 0, Float.NaN, -1);
}
/**
@@ -102,15 +94,20 @@ public class DMatrix {
* row number
* @throws XGBoostError
*/
public DMatrix(long[] headers, int[] indices, float[] data, DMatrix.SparseType st, int shapeParam)
throws XGBoostError {
public DMatrix(long[] headers, int[] indices, float[] data, DMatrix.SparseType st,
int shapeParam) throws XGBoostError {
this(headers, indices, data, st, shapeParam, Float.NaN, -1);
}
public DMatrix(long[] headers, int[] indices, float[] data, DMatrix.SparseType st, int shapeParam,
float missing, int nthread) throws XGBoostError {
long[] out = new long[1];
if (st == SparseType.CSR) {
XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixCreateFromCSREx(headers, indices, data,
shapeParam, out));
XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixCreateFromCSR(headers, indices, data,
shapeParam, missing, nthread, out));
} else if (st == SparseType.CSC) {
XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixCreateFromCSCEx(headers, indices, data,
shapeParam, out));
XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixCreateFromCSC(headers, indices, data,
shapeParam, missing, nthread, out));
} else {
throw new UnknownError("unknow sparsetype");
}
@@ -425,6 +422,18 @@ public class DMatrix {
return rowNum[0];
}
/**
 * Returns how many entries of this DMatrix are not missing.
 *
 * <p>The count is obtained from the native library through
 * {@code XGBoostJNI.XGDMatrixNumNonMissing}, which writes it into the first slot of a
 * one-element output array.
 *
 * @return the number of non-missing values
 * @throws XGBoostError if the native call reports an error
 */
public long nonMissingNum() throws XGBoostError {
  final long[] count = {0L};
  final int status = XGBoostJNI.XGDMatrixNumNonMissing(handle, count);
  XGBoostJNI.checkCall(status);
  return count[0];
}
/**
* save DMatrix to filePath
*/

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2014-2022 by Contributors
Copyright (c) 2014-2023 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -56,11 +56,15 @@ class XGBoostJNI {
final static native int XGDMatrixCreateFromDataIter(java.util.Iterator<DataBatch> iter,
String cache_info, long[] out);
public final static native int XGDMatrixCreateFromCSREx(long[] indptr, int[] indices, float[] data,
int shapeParam, long[] out);
/**
 * Native entry point that creates a DMatrix from sparse data in CSR layout.
 *
 * <p>The native side checks that the last element of {@code indptr} equals the length of
 * {@code data}.
 *
 * @param indptr     row pointers into {@code indices} and {@code data}
 * @param indices    column index of each stored value
 * @param data       stored values, ordered by row
 * @param shapeParam number of columns of the matrix
 * @param missing    value forwarded to the native library as the "missing" setting
 * @param nthread    value forwarded to the native library as the "nthread" setting
 * @param out        one-element array that receives the handle of the created DMatrix
 * @return native status code; callers pass it to {@code checkCall}
 */
public final static native int XGDMatrixCreateFromCSR(long[] indptr, int[] indices,
float[] data, int shapeParam,
float missing, int nthread,
long[] out);
public final static native int XGDMatrixCreateFromCSCEx(long[] colptr, int[] indices, float[] data,
int shapeParam, long[] out);
/**
 * Native entry point that creates a DMatrix from sparse data in CSC layout.
 *
 * <p>The native side checks that the last element of {@code colptr} equals the length of
 * {@code data}.
 *
 * @param colptr     column pointers into {@code indices} and {@code data}
 * @param indices    row index of each stored value
 * @param data       stored values, ordered by column
 * @param shapeParam number of rows of the matrix
 * @param missing    value forwarded to the native library as the "missing" setting
 * @param nthread    value forwarded to the native library as the "nthread" setting
 * @param out        one-element array that receives the handle of the created DMatrix
 * @return native status code; callers pass it to {@code checkCall}
 */
public final static native int XGDMatrixCreateFromCSC(long[] colptr, int[] indices,
float[] data, int shapeParam,
float missing, int nthread,
long[] out);
public final static native int XGDMatrixCreateFromMat(float[] data, int nrow, int ncol,
float missing, long[] out);
@@ -96,6 +100,7 @@ class XGBoostJNI {
long[] outLength, String[][] outValues);
public final static native int XGDMatrixNumRow(long handle, long[] row);
/**
 * Native entry point that queries the number of non-missing values of a DMatrix.
 *
 * @param handle      native handle of the DMatrix
 * @param nonMissings output array; the count is written to index 0
 * @return native status code; callers pass it to {@code checkCall}
 */
public final static native int XGDMatrixNumNonMissing(long handle, long[] nonMissings);
public final static native int XGBoosterCreate(long[] handles, long[] out);

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2014,2021 by Contributors
Copyright (c) 2014-2023 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -54,7 +54,7 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
@throws(classOf[XGBoostError])
@deprecated
def this(headers: Array[Long], indices: Array[Int], data: Array[Float], st: JDMatrix.SparseType) {
this(new JDMatrix(headers, indices, data, st))
this(new JDMatrix(headers, indices, data, st, 0, Float.NaN, -1))
}
/**
@@ -70,7 +70,25 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
@throws(classOf[XGBoostError])
def this(headers: Array[Long], indices: Array[Int], data: Array[Float], st: JDMatrix.SparseType,
shapeParam: Int) {
this(new JDMatrix(headers, indices, data, st, shapeParam))
this(new JDMatrix(headers, indices, data, st, shapeParam, Float.NaN, -1))
}
/**
 * Create a DMatrix from a sparse matrix, with explicit missing-value and thread settings.
 *
 * All arguments are passed unchanged to the Java DMatrix constructor of the same shape.
 *
 * @param headers index to headers (row headers for CSR or column headers for CSC)
 * @param indices indices (column indices for CSR or row indices for CSC)
 * @param data non-zero values (ordered by row for CSR or by column for CSC)
 * @param st sparse matrix type (CSR or CSC)
 * @param shapeParam when st is CSR, it specifies the number of columns; otherwise it is
 *                   taken as the number of rows
 * @param missing the value that represents a missing entry
 * @param nthread the number of threads used for constructing the DMatrix
 */
@throws(classOf[XGBoostError])
def this(headers: Array[Long], indices: Array[Int], data: Array[Float], st: JDMatrix.SparseType,
shapeParam: Int, missing: Float, nthread: Int) {
this(new JDMatrix(headers, indices, data, st, shapeParam, missing, nthread))
}
/**
@@ -78,7 +96,7 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
* @param columnBatch the XGBoost ColumnBatch to provide the cuda array interface
* of feature columns
* @param missing missing value
* @param nthread threads number
* @param nthread The number of threads used for constructing DMatrix
*/
@throws(classOf[XGBoostError])
def this(columnBatch: ColumnBatch, missing: Float, nthread: Int) {
@@ -246,6 +264,16 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
jDMatrix.rowNum
}
/**
 * Number of entries in this DMatrix that are not missing.
 *
 * Delegates to the wrapped Java DMatrix.
 *
 * @return the count of non-missing values
 */
@throws(classOf[XGBoostError])
def nonMissingNum: Long = jDMatrix.nonMissingNum
/**
* save DMatrix to filePath
*

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2014-2022 by Contributors
/**
Copyright (c) 2014-2023 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
@@ -12,18 +12,23 @@
limitations under the License.
*/
#include "./xgboost4j.h"
#include <rabit/c_api.h>
#include <xgboost/base.h>
#include <xgboost/c_api.h>
#include <xgboost/json.h>
#include <xgboost/logging.h>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <rabit/c_api.h>
#include <xgboost/c_api.h>
#include <xgboost/base.h>
#include <xgboost/logging.h>
#include <xgboost/json.h>
#include "./xgboost4j.h"
#include <cstring>
#include <vector>
#include <limits>
#include <string>
#include <type_traits>
#include <vector>
#include "../../../src/c_api/c_api_utils.h"
#define JVM_CHECK_CALL(__expr) \
{ \
@@ -219,58 +224,89 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFro
return ret;
}
/*
* Class: ml_dmlc_xgboost4j_java_XGBoostJNI
* Method: XGDMatrixCreateFromCSREx
* Signature: ([J[I[FI[J)I
namespace {
/**
* \brief Create from sparse matrix.
*
* \param maker Indirect call to XGBoost C function for creating CSC and CSR.
*
* \return Status
*/
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSREx
(JNIEnv *jenv, jclass jcls, jlongArray jindptr, jintArray jindices, jfloatArray jdata, jint jcol, jlongArray jout) {
template <typename Fn>
jint MakeJVMSparseInput(JNIEnv *jenv, jlongArray jindptr, jintArray jindices, jfloatArray jdata,
jfloat jmissing, jint jnthread, Fn &&maker, jlongArray jout) {
DMatrixHandle result;
jlong* indptr = jenv->GetLongArrayElements(jindptr, 0);
jint* indices = jenv->GetIntArrayElements(jindices, 0);
jfloat* data = jenv->GetFloatArrayElements(jdata, 0);
bst_ulong nindptr = (bst_ulong)jenv->GetArrayLength(jindptr);
bst_ulong nelem = (bst_ulong)jenv->GetArrayLength(jdata);
jint ret = (jint) XGDMatrixCreateFromCSREx((size_t const *)indptr,
(unsigned int const *)indices,
(float const *)data,
nindptr, nelem, jcol, &result);
jlong *indptr = jenv->GetLongArrayElements(jindptr, nullptr);
jint *indices = jenv->GetIntArrayElements(jindices, nullptr);
jfloat *data = jenv->GetFloatArrayElements(jdata, nullptr);
bst_ulong nindptr = static_cast<bst_ulong>(jenv->GetArrayLength(jindptr));
bst_ulong nelem = static_cast<bst_ulong>(jenv->GetArrayLength(jdata));
std::string sindptr, sindices, sdata;
CHECK_EQ(indptr[nindptr - 1], nelem);
using IndPtrT = std::conditional_t<std::is_convertible<jlong *, long *>::value, long, long long>;
using IndT =
std::conditional_t<std::is_convertible<jint *, std::int32_t *>::value, std::int32_t, long>;
xgboost::detail::MakeSparseFromPtr(
static_cast<IndPtrT const *>(indptr), static_cast<IndT const *>(indices),
static_cast<float const *>(data), nindptr, &sindptr, &sindices, &sdata);
xgboost::Json jconfig{xgboost::Object{}};
auto missing = static_cast<float>(jmissing);
auto n_threads = static_cast<std::int32_t>(jnthread);
// Construct configuration
jconfig["nthread"] = xgboost::Integer{n_threads};
jconfig["missing"] = xgboost::Number{missing};
std::string config;
xgboost::Json::Dump(jconfig, &config);
jint ret = maker(sindptr.c_str(), sindices.c_str(), sdata.c_str(), config.c_str(), &result);
JVM_CHECK_CALL(ret);
setHandle(jenv, jout, result);
//Release
// Release
jenv->ReleaseLongArrayElements(jindptr, indptr, 0);
jenv->ReleaseIntArrayElements(jindices, indices, 0);
jenv->ReleaseFloatArrayElements(jdata, data, 0);
return ret;
}
} // anonymous namespace
/*
* Class: ml_dmlc_xgboost4j_java_XGBoostJNI
* Method: XGDMatrixCreateFromCSCEx
* Signature: ([J[I[FI[J)I
* Method: XGDMatrixCreateFromCSR
* Signature: ([J[I[FIFI[J)I
*/
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSCEx
(JNIEnv *jenv, jclass jcls, jlongArray jindptr, jintArray jindices, jfloatArray jdata, jint jrow, jlongArray jout) {
DMatrixHandle result;
jlong* indptr = jenv->GetLongArrayElements(jindptr, NULL);
jint* indices = jenv->GetIntArrayElements(jindices, 0);
jfloat* data = jenv->GetFloatArrayElements(jdata, NULL);
bst_ulong nindptr = (bst_ulong)jenv->GetArrayLength(jindptr);
bst_ulong nelem = (bst_ulong)jenv->GetArrayLength(jdata);
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSR(
    JNIEnv *jenv, jclass jcls, jlongArray jindptr, jintArray jindices, jfloatArray jdata, jint jcol,
    jfloat jmissing, jint jnthread, jlongArray jout) {
  // For CSR input the shape parameter is the number of columns.
  auto const n_cols = static_cast<std::int32_t>(jcol);
  // Adapter handed to the shared helper: it receives the array-interface strings and the
  // JSON configuration built there and forwards them to the CSR C API.
  auto make_csr = [n_cols](char const *indptr, char const *indices, char const *values,
                           char const *config, DMatrixHandle *out) {
    return XGDMatrixCreateFromCSR(indptr, indices, values, n_cols, config, out);
  };
  return MakeJVMSparseInput(jenv, jindptr, jindices, jdata, jmissing, jnthread, make_csr, jout);
}
jint ret = (jint) XGDMatrixCreateFromCSCEx((size_t const *)indptr,
(unsigned int const *)indices,
(float const *)data,
nindptr, nelem, jrow, &result);
JVM_CHECK_CALL(ret);
setHandle(jenv, jout, result);
//release
jenv->ReleaseLongArrayElements(jindptr, indptr, 0);
jenv->ReleaseIntArrayElements(jindices, indices, 0);
jenv->ReleaseFloatArrayElements(jdata, data, 0);
return ret;
/*
 * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
 * Method:    XGDMatrixCreateFromCSC
 * Signature: ([J[I[FIFI[J)I
 */
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSC(
    JNIEnv *jenv, jclass jcls, jlongArray jindptr, jintArray jindices, jfloatArray jdata, jint jrow,
    jfloat jmissing, jint jnthread, jlongArray jout) {
  // For CSC input the shape parameter is the number of rows.
  auto const n_rows = static_cast<bst_ulong>(jrow);
  // Adapter handed to the shared helper: it receives the array-interface strings and the
  // JSON configuration built there and forwards them to the CSC C API.
  auto make_csc = [n_rows](char const *indptr, char const *indices, char const *values,
                           char const *config, DMatrixHandle *out) {
    return XGDMatrixCreateFromCSC(indptr, indices, values, n_rows, config, out);
  };
  return MakeJVMSparseInput(jenv, jindptr, jindices, jdata, jmissing, jnthread, make_csc, jout);
}
/*
@@ -459,6 +495,23 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixNumRow
return ret;
}
/*
 * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
 * Method:    XGDMatrixNumNonMissing
 * Signature: (J[J)I
 *
 * Queries the number of non-missing values of the DMatrix behind `jhandle` and stores it
 * in the first element of `jout`. Returns the status code of the underlying C API call.
 */
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixNumNonMissing(
    JNIEnv *jenv, jclass, jlong jhandle, jlongArray jout) {
  DMatrixHandle handle = reinterpret_cast<DMatrixHandle>(jhandle);
  CHECK(handle);
  // Zero-initialised so that an indeterminate value is never read, even when the C API
  // call fails without writing its output.
  bst_ulong n_non_missing{0};
  auto ret = static_cast<jint>(XGDMatrixNumNonMissing(handle, &n_non_missing));
  // Check the status before publishing the result to the Java array, in the same order
  // the sparse-input helper in this file uses for its output handle.
  JVM_CHECK_CALL(ret);
  jlong jresult[1]{static_cast<jlong>(n_non_missing)};
  jenv->SetLongArrayRegion(jout, 0, 1, jresult);
  return ret;
}
/*
* Class: ml_dmlc_xgboost4j_java_XGBoostJNI
* Method: XGBoosterCreate

View File

@@ -33,19 +33,19 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFro
/*
* Class: ml_dmlc_xgboost4j_java_XGBoostJNI
* Method: XGDMatrixCreateFromCSREx
* Signature: ([J[I[FI[J)I
* Method: XGDMatrixCreateFromCSR
* Signature: ([J[I[FIFI[J)I
*/
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSREx
(JNIEnv *, jclass, jlongArray, jintArray, jfloatArray, jint, jlongArray);
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSR
(JNIEnv *, jclass, jlongArray, jintArray, jfloatArray, jint, jfloat, jint, jlongArray);
/*
* Class: ml_dmlc_xgboost4j_java_XGBoostJNI
* Method: XGDMatrixCreateFromCSCEx
* Signature: ([J[I[FI[J)I
* Method: XGDMatrixCreateFromCSC
* Signature: ([J[I[FIFI[J)I
*/
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSCEx
(JNIEnv *, jclass, jlongArray, jintArray, jfloatArray, jint, jlongArray);
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSC
(JNIEnv *, jclass, jlongArray, jintArray, jfloatArray, jint, jfloat, jint, jlongArray);
/*
* Class: ml_dmlc_xgboost4j_java_XGBoostJNI
@@ -119,6 +119,22 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetFloatI
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetUIntInfo
(JNIEnv *, jclass, jlong, jstring, jobjectArray);
/*
* Class: ml_dmlc_xgboost4j_java_XGBoostJNI
* Method: XGDMatrixSetStrFeatureInfo
* Signature: (JLjava/lang/String;[Ljava/lang/String;)I
*/
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetStrFeatureInfo
(JNIEnv *, jclass, jlong, jstring, jobjectArray);
/*
* Class: ml_dmlc_xgboost4j_java_XGBoostJNI
* Method: XGDMatrixGetStrFeatureInfo
* Signature: (JLjava/lang/String;[J[[Ljava/lang/String;)I
*/
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetStrFeatureInfo
(JNIEnv *, jclass, jlong, jstring, jlongArray, jobjectArray);
/*
* Class: ml_dmlc_xgboost4j_java_XGBoostJNI
* Method: XGDMatrixNumRow
@@ -127,6 +143,14 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetUIntIn
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixNumRow
(JNIEnv *, jclass, jlong, jlongArray);
/*
* Class: ml_dmlc_xgboost4j_java_XGBoostJNI
* Method: XGDMatrixNumNonMissing
* Signature: (J[J)I
*/
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixNumNonMissing
(JNIEnv *, jclass, jlong, jlongArray);
/*
* Class: ml_dmlc_xgboost4j_java_XGBoostJNI
* Method: XGBoosterCreate
@@ -351,7 +375,7 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDeviceQuantileDM
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGQuantileDMatrixCreateFromCallback
(JNIEnv *, jclass, jobject, jobject, jstring, jlongArray);
/*
/*
* Class: ml_dmlc_xgboost4j_java_XGBoostJNI
* Method: XGDMatrixCreateFromArrayInterfaceColumns
* Signature: (Ljava/lang/String;FI[J)I
@@ -359,22 +383,6 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGQuantileDMatrixC
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromArrayInterfaceColumns
(JNIEnv *, jclass, jstring, jfloat, jint, jlongArray);
/*
* Class: ml_dmlc_xgboost4j_java_XGBoostJNI
* Method: XGDMatrixSetStrFeatureInfo
* Signature: (JLjava/lang/String;[Ljava/lang/String;)I
*/
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetStrFeatureInfo
(JNIEnv *, jclass, jlong, jstring, jobjectArray);
/*
* Class: ml_dmlc_xgboost4j_java_XGBoostJNI
* Method: XGDMatrixGetStrFeatureInfo
* Signature: (JLjava/lang/String;[J[[Ljava/lang/String;)I
*/
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetStrFeatureInfo
(JNIEnv *, jclass, jlong, jstring, jlongArray, jobjectArray);
#ifdef __cplusplus
}
#endif

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2014 by Contributors
Copyright (c) 2014-2023 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -54,6 +54,9 @@ class DMatrixSuite extends FunSuite {
dmat1.setLabel(label1)
val label2 = dmat1.getLabel
assert(label2 === label1)
val dmat2 = new DMatrix(rowHeaders, colIndex, data, JDMatrix.SparseType.CSR, 5, 1.0f, -1)
assert(dmat2.nonMissingNum === 9);
}
test("create DMatrix from CSREx") {
@@ -94,6 +97,9 @@ class DMatrixSuite extends FunSuite {
dmat1.setLabel(label1)
val label2 = dmat1.getLabel
assert(label2 === label1)
val dmat2 = new DMatrix(colHeaders, rowIndex, data, JDMatrix.SparseType.CSC, 5, 1.0f, -1)
assert(dmat2.nonMissingNum === 9);
}
test("create DMatrix from CSCEx") {