Documenting CSV loading into DMatrix (#3137)
* Support CSV file in DMatrix: we'd just need to expose the CSV parser in dmlc-core to the Python wrapper. * Revert extra code; document existing CSV support: CSV support is already there but undocumented. * Add notice about categorical features.
This commit is contained in:
parent
d5992dd881
commit
32ea70c1c9
@ -78,6 +78,8 @@ master_doc = 'index'
|
|||||||
# Usually you set "language" from the command line for these cases.
|
# Usually you set "language" from the command line for these cases.
|
||||||
language = None
|
language = None
|
||||||
|
|
||||||
|
autoclass_content = 'both'
|
||||||
|
|
||||||
# There are two options for replacing |today|: either, you set today to some
|
# There are two options for replacing |today|: either, you set today to some
|
||||||
# non-false value, then it is used:
|
# non-false value, then it is used:
|
||||||
#today = ''
|
#today = ''
|
||||||
|
|||||||
@ -25,7 +25,9 @@ Data Interface
|
|||||||
--------------
|
--------------
|
||||||
The XGBoost python module is able to load data from:
|
The XGBoost python module is able to load data from:
|
||||||
- libsvm txt format file
|
- libsvm txt format file
|
||||||
- Numpy 2D array, and
|
- comma-separated values (CSV) file
|
||||||
|
- Numpy 2D array
|
||||||
|
- Scipy 2D sparse array, and
|
||||||
- xgboost binary buffer file.
|
- xgboost binary buffer file.
|
||||||
|
|
||||||
The data is stored in a ```DMatrix``` object.
|
The data is stored in a ```DMatrix``` object.
|
||||||
@ -35,6 +37,16 @@ The data is stored in a ```DMatrix``` object.
|
|||||||
dtrain = xgb.DMatrix('train.svm.txt')
|
dtrain = xgb.DMatrix('train.svm.txt')
|
||||||
dtest = xgb.DMatrix('test.svm.buffer')
|
dtest = xgb.DMatrix('test.svm.buffer')
|
||||||
```
|
```
|
||||||
|
* To load a CSV file into ```DMatrix```:
|
||||||
|
```python
|
||||||
|
# label_column specifies the index of the column containing the true label
|
||||||
|
dtrain = xgb.DMatrix('train.csv?format=csv&label_column=0')
|
||||||
|
dtest = xgb.DMatrix('test.csv?format=csv&label_column=0')
|
||||||
|
```
|
||||||
|
(Note that XGBoost does not support categorical features; if your data contains
|
||||||
|
categorical features, load it as a numpy array first and then perform
|
||||||
|
[one-hot encoding](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html).)
|
||||||
|
|
||||||
* To load a numpy array into ```DMatrix```:
|
* To load a numpy array into ```DMatrix```:
|
||||||
```python
|
```python
|
||||||
data = np.random.rand(5, 10) # 5 entities, each containing 10 features
|
data = np.random.rand(5, 10) # 5 entities, each containing 10 features
|
||||||
|
|||||||
@ -235,8 +235,6 @@ class DMatrix(object):
|
|||||||
feature_names=None, feature_types=None,
|
feature_names=None, feature_types=None,
|
||||||
nthread=None):
|
nthread=None):
|
||||||
"""
|
"""
|
||||||
Data matrix used in XGBoost.
|
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
data : string/numpy array/scipy.sparse/pd.DataFrame
|
data : string/numpy array/scipy.sparse/pd.DataFrame
|
||||||
@ -706,7 +704,7 @@ class DMatrix(object):
|
|||||||
|
|
||||||
|
|
||||||
class Booster(object):
|
class Booster(object):
|
||||||
""""A Booster of XGBoost.
|
"""A Booster of XGBoost.
|
||||||
|
|
||||||
Booster is the model of xgboost, that contains low level routines for
|
Booster is the model of xgboost, that contains low level routines for
|
||||||
training, prediction and evaluation.
|
training, prediction and evaluation.
|
||||||
@ -716,8 +714,7 @@ class Booster(object):
|
|||||||
|
|
||||||
def __init__(self, params=None, cache=(), model_file=None):
|
def __init__(self, params=None, cache=(), model_file=None):
|
||||||
# pylint: disable=invalid-name
|
# pylint: disable=invalid-name
|
||||||
"""Initialize the Booster.
|
"""
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
params : dict
|
params : dict
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user