make DMatrix._init_from_npy2d only copy data when necessary (#1637)
* make DMatrix._init_from_npy2d only copy data when necessary. When creating a DMatrix from a 2-D ndarray, the input data can be copied unnecessarily. This is problematic when the data is already very large, because the extra copy can exhaust available memory. The copy is temporary (it goes out of scope at the end of this function), but it still adds to peak memory usage. By default, ``numpy.array`` always copies its input. With ``copy=False``, it copies only when necessary. Since XGDMatrixCreateFromMat only reads from the input buffer, this copy is not needed. Also added comments explaining when a copy can happen (if the data ordering/layout is wrong or if the type is not 32-bit float). * remove whitespace
This commit is contained in:
parent
e79a803a30
commit
9b2e41340b
@ -311,10 +311,21 @@ class DMatrix(object):
|
|||||||
def _init_from_npy2d(self, mat, missing):
|
def _init_from_npy2d(self, mat, missing):
|
||||||
"""
|
"""
|
||||||
Initialize data from a 2-D numpy matrix.
|
Initialize data from a 2-D numpy matrix.
|
||||||
|
|
||||||
|
If ``mat`` does not have ``order='C'`` (aka row-major) or is not contiguous,
|
||||||
|
a temporary copy will be made.
|
||||||
|
|
||||||
|
If ``mat`` does not have ``dtype=numpy.float32``, a temporary copy will be made.
|
||||||
|
|
||||||
|
So there could be as many as two temporary data copies; be mindful of input layout
|
||||||
|
and type if memory use is a concern.
|
||||||
"""
|
"""
|
||||||
if len(mat.shape) != 2:
|
if len(mat.shape) != 2:
|
||||||
raise ValueError('Input numpy.ndarray must be 2 dimensional')
|
raise ValueError('Input numpy.ndarray must be 2 dimensional')
|
||||||
data = np.array(mat.reshape(mat.size), dtype=np.float32)
|
# flatten the array by rows and ensure it is float32.
|
||||||
|
# we try to avoid data copies if possible (reshape returns a view when possible
|
||||||
|
# and we explicitly tell np.array to try and avoid copying)
|
||||||
|
data = np.array(mat.reshape(mat.size), copy=False, dtype=np.float32)
|
||||||
self.handle = ctypes.c_void_p()
|
self.handle = ctypes.c_void_p()
|
||||||
missing = missing if missing is not None else np.nan
|
missing = missing if missing is not None else np.nan
|
||||||
_check_call(_LIB.XGDMatrixCreateFromMat(data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
|
_check_call(_LIB.XGDMatrixCreateFromMat(data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user