check unity back

This commit is contained in:
tqchen 2014-08-29 18:31:24 -07:00
parent 04c520ea3d
commit 551b3b70f1
6 changed files with 844 additions and 32 deletions

View File

@ -5,6 +5,8 @@
#include "../utils/io.h" #include "../utils/io.h"
#include "../utils/utils.h" #include "../utils/utils.h"
#include "simple_dmatrix-inl.hpp" #include "simple_dmatrix-inl.hpp"
#include "page_dmatrix-inl.hpp"
// implements data loads using dmatrix simple for now // implements data loads using dmatrix simple for now
namespace xgboost { namespace xgboost {

204
src/io/page_dmatrix-inl.hpp Normal file
View File

@ -0,0 +1,204 @@
#ifndef XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
#define XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
/*!
* \file page_dmatrix-inl.hpp
* row iterator based on sparse page
* \author Tianqi Chen
*/
#include "../data.h"
#include "../utils/iterator.h"
#include "../utils/thread_buffer.h"
namespace xgboost {
namespace io {
/*!
 * \brief page structure that can be used to store a rowbatch
 *  everything lives in one int array data_ of kPageSize elements:
 *  - data_[0] is the number of rows stored in the page
 *  - the row entries are packed as RowBatch::Entry starting at data_[1]
 *  - data_[kPageSize - 1 - i] is the entry offset at which row i starts,
 *    so the row pointer table grows backwards from the end of the array
 */
struct RowBatchPage {
 public:
  RowBatchPage(void) {
    data_ = new int[kPageSize];
    utils::Assert(data_ != NULL, "fail to allocate row batch page");
    this->Clear();
  }
  ~RowBatchPage(void) {
    delete [] data_;
  }
  /*!
   * \brief Push one row into page
   * \param row an instance row
   * \return true if the row was stored, false if the page does not have
   *   enough free space left for the row and its row pointer
   */
  inline bool PushRow(const RowBatch::Inst &row) {
    const size_t dsize = row.length * sizeof(RowBatch::Entry);
    if (FreeBytes() < dsize + sizeof(int)) return false;
    const int nrow = Size();
    // entry offset where the stored rows end, this is where the new row starts
    const int begin = row_ptr(nrow);
    row_ptr(nrow + 1) = begin + row.length;
    // copy behind the entries already stored; indexing by the row count
    // instead would overwrite earlier rows that have more than one entry
    if (dsize != 0) memcpy(data_ptr(begin), row.data, dsize);
    ++data_[0];
    return true;
  }
  /*!
   * \brief get a row batch representation from the page
   * \param p_rptr a temporal space that can be used to provide
   *   ind_ptr storage for RowBatch, it is resized to Size() + 1
   * \param base_rowid row id assigned to the first row of the batch
   * \return a new RowBatch object, it points into this page and into *p_rptr
   */
  inline RowBatch GetRowBatch(std::vector<size_t> *p_rptr, size_t base_rowid) {
    RowBatch batch;
    batch.base_rowid = base_rowid;
    batch.data_ptr = this->data_ptr(0);
    batch.size = static_cast<size_t>(this->Size());
    std::vector<size_t> &rptr = *p_rptr;
    rptr.resize(this->Size() + 1);
    for (size_t i = 0; i < rptr.size(); ++i) {
      rptr[i] = static_cast<size_t>(this->row_ptr(static_cast<int>(i)));
    }
    batch.ind_ptr = &rptr[0];
    return batch;
  }
  /*!
   * \brief clear the page, cleanup the content
   */
  inline void Clear(void) {
    memset(&data_[0], 0, sizeof(int) * kPageSize);
  }
  /*!
   * \brief load one page from instream
   * \return true if loading is successful, i.e. Read reported a non-zero result
   */
  inline bool Load(utils::IStream &fi) {
    return fi.Read(&data_[0], sizeof(int) * kPageSize) != 0;
  }
  /*! \brief save one page into outstream, always writes the whole page */
  inline void Save(utils::IStream &fo) {
    fo.Write(&data_[0], sizeof(int) * kPageSize);
  }
  /*! \return number of rows stored in the page */
  inline int Size(void) const {
    return data_[0];
  }
  /*! \brief number of int slots in a page, 64 MB when int has 4 bytes */
  static const size_t kPageSize = 64 << 18;

 private:
  // the page owns data_, copying it would lead to a double delete
  RowBatchPage(const RowBatchPage &);
  RowBatchPage &operator=(const RowBatchPage &);
  /*! \return number of bytes that are still unused in the page */
  inline size_t FreeBytes(void) {
    // slots not taken by the row counter and the row pointer table,
    // minus the bytes taken by the entries stored so far
    return (kPageSize - (Size() + 2)) * sizeof(int)
        - row_ptr(Size()) * sizeof(RowBatch::Entry);
  }
  /*! \brief equivalent row pointer at i, stored backwards from the end of data_ */
  inline int& row_ptr(int i) {
    return data_[kPageSize - i - 1];
  }
  /*! \brief pointer to the i-th entry of the packed entry area */
  inline RowBatch::Entry* data_ptr(int i) {
    return reinterpret_cast<RowBatch::Entry*>(&data_[1]) + i;
  }
  // content of data
  int *data_;
};
/*!
 * \brief thread buffer iterator
 *  iterates over the RowBatchPage stored in a file, every call to Next
 *  exposes one page as a RowBatch; the pages are read through a ThreadBuffer
 */
class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
 public:
  ThreadRowPageIterator(void) {
    itr.SetParam("buffer_size", "4");
    page_ = NULL;
    ptop_ = 0;
    base_rowid_ = 0;
    isend_ = false;
  }
  virtual ~ThreadRowPageIterator(void) {
  }
  virtual void Init(void) {
  }
  /*! \brief rewind, the next call to Next returns the first page again */
  virtual void BeforeFirst(void) {
    itr.BeforeFirst();
    isend_ = false;
    base_rowid_ = 0;
    // do not fetch a page here: Next() fetches before it converts,
    // fetching here as well would silently drop the first page
  }
  /*!
   * \brief move to the next page
   * \return false if there is no more page, true otherwise
   */
  virtual bool Next(void) {
    if (!this->LoadNextPage()) return false;
    out_ = page_->GetRowBatch(&tmp_ptr_, base_rowid_);
    base_rowid_ += out_.size;
    return true;
  }
  /*! \brief the batch of the current page, valid after Next returned true */
  virtual const RowBatch &Value(void) const {
    return out_;
  }
  /*! \brief load and initialize the iterator with fi */
  inline void Load(const utils::FileStream &fi) {
    itr.get_factory().SetFile(fi);
    itr.Init();
    this->BeforeFirst();
  }
  /*!
   * \brief save a row iterator to output stream, in row iterator format
   *  rows are packed into pages, a page is written out once the next row
   *  does not fit; a row that does not fit into an empty page fails the
   *  utils::Check with "row is too big"
   * \param iter the iterator that provides the rows, it is rewound first
   * \param fo the stream to write the pages to
   */
  inline static void Save(utils::IIterator<RowBatch> *iter,
                          utils::IStream &fo) {
    RowBatchPage page;
    iter->BeforeFirst();
    while (iter->Next()) {
      const RowBatch &batch = iter->Value();
      for (size_t i = 0; i < batch.size; ++i) {
        if (!page.PushRow(batch[i])) {
          page.Save(fo);
          page.Clear();
          utils::Check(page.PushRow(batch[i]), "row is too big");
        }
      }
    }
    if (page.Size() != 0) page.Save(fo);
  }

 private:
  // load in next page
  inline bool LoadNextPage(void) {
    ptop_ = 0;
    bool ret = itr.Next(page_);
    isend_ = !ret;
    return ret;
  }
  // base row id
  size_t base_rowid_;
  // temporal ptr
  std::vector<size_t> tmp_ptr_;
  // output data
  RowBatch out_;
  // whether we reach end of file
  bool isend_;
  // page pointer type
  typedef RowBatchPage* PagePtr;
  // loader factory for page
  struct Factory {
   public:
    // position of the stream at the time SetFile was called
    size_t file_begin_;
    utils::FileStream fi;
    Factory(void) : file_begin_(0) {}
    inline void SetFile(const utils::FileStream &fi) {
      this->fi = fi;
      file_begin_ = this->fi.Tell();
    }
    inline bool Init(void) {
      return true;
    }
    inline void SetParam(const char *name, const char *val) {}
    inline bool LoadNext(PagePtr &val) {
      return val->Load(fi);
    }
    inline PagePtr Create(void) {
      PagePtr a = new RowBatchPage();
      return a;
    }
    inline void FreeSpace(PagePtr &a) {
      delete a;
    }
    inline void Destroy(void) {}
    inline void BeforeFirst(void) {
      fi.Seek(file_begin_);
    }
  };

 protected:
  PagePtr page_;
  int ptop_;
  utils::ThreadBuffer<PagePtr,Factory> itr;
};
} // namespace io
} // namespace xgboost
#endif // XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_

View File

@ -88,11 +88,19 @@ class IStream {
} }
}; };
/*! \brief implementation of file i/o stream */ /*! \brief interface of i/o stream that support seek */
class FileStream : public IStream { class ISeekStream: public IStream {
private:
FILE *fp;
public: public:
/*! \brief seek to certain position of the file */
virtual void Seek(size_t pos) = 0;
/*! \brief tell the position of the stream */
virtual size_t Tell(void) = 0;
};
/*! \brief implementation of file i/o stream */
class FileStream : public ISeekStream {
public:
explicit FileStream(void) {}
explicit FileStream(FILE *fp) { explicit FileStream(FILE *fp) {
this->fp = fp; this->fp = fp;
} }
@ -102,12 +110,18 @@ class FileStream : public IStream {
virtual void Write(const void *ptr, size_t size) { virtual void Write(const void *ptr, size_t size) {
fwrite(ptr, size, 1, fp); fwrite(ptr, size, 1, fp);
} }
inline void Seek(size_t pos) { virtual void Seek(size_t pos) {
fseek(fp, 0, SEEK_SET); fseek(fp, pos, SEEK_SET);
}
virtual size_t Tell(void) {
return static_cast<size_t>(ftell(fp));
} }
inline void Close(void) { inline void Close(void) {
fclose(fp); fclose(fp);
} }
private:
FILE *fp;
}; };
} // namespace utils } // namespace utils

146
src/utils/thread.h Normal file
View File

@ -0,0 +1,146 @@
#ifndef XGBOOST_UTILS_THREAD_H
#define XGBOOST_UTILS_THREAD_H
/*!
* \file thread.h
* \brief this header include the minimum necessary resource for multi-threading
* \author Tianqi Chen
* Acknowledgement: this file is adapted from SVDFeature project, by same author.
* The MAC support part of this code is provided by Artemy Kolchinsky
*/
#ifdef _MSC_VER
#include "utils.h"
#include <windows.h>
#include <process.h>
namespace xgboost {
namespace utils {
/*! \brief simple semaphore used for synchronization */
class Semaphore {
public :
inline void Init(int init_val) {
sem = CreateSemaphore(NULL, init_val, 10, NULL);
utils::Assert(sem != NULL, "create Semaphore error");
}
inline void Destroy(void) {
CloseHandle(sem);
}
inline void Wait(void) {
utils::Assert(WaitForSingleObject(sem, INFINITE) == WAIT_OBJECT_0, "WaitForSingleObject error");
}
inline void Post(void) {
utils::Assert(ReleaseSemaphore(sem, 1, NULL) != 0, "ReleaseSemaphore error");
}
private:
HANDLE sem;
};
/*! \brief simple thread that wraps windows thread */
class Thread {
private:
HANDLE thread_handle;
unsigned thread_id;
public:
inline void Start(unsigned int __stdcall entry(void*), void *param) {
thread_handle = (HANDLE)_beginthreadex(NULL, 0, entry, param, 0, &thread_id);
}
inline int Join(void) {
WaitForSingleObject(thread_handle, INFINITE);
return 0;
}
};
/*!
 * \brief exit function called from thread
 * \param status not used by this implementation, the thread ends with exit code 0
 */
inline void ThreadExit(void *status) {
  (void)status;
  _endthreadex(0);
}
#define XGBOOST_THREAD_PREFIX unsigned int __stdcall
} // namespace utils
} // namespace xgboost
#else
// thread interface using g++
#include <semaphore.h>
#include <pthread.h>
namespace xgboost {
namespace utils {
/*!
 * \brief semaphore class
 *  uses a named semaphore (sem_open) when __APPLE__ is defined
 *  and an unnamed one (sem_init) otherwise
 */
class Semaphore {
#ifdef __APPLE__
 private:
  sem_t* semPtr;
  // "/se/" prefix (4 chars) + 16 random chars + terminating zero
  char sema_name[21];

 private:
  /*! \brief fill s[0..len-1] with random characters and set s[len] to zero */
  inline void GenRandomString(char *s, const int len) {
    static const char alphanum[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    for (int i = 0; i < len; ++i) {
      s[i] = alphanum[rand() % (sizeof(alphanum) - 1)];
    }
    s[len] = 0;
  }

 public:
  /*! \brief create the semaphore with init_val as initial count, exits on failure */
  inline void Init(int init_val) {
    sema_name[0] = '/';
    sema_name[1] = 's';
    sema_name[2] = 'e';
    sema_name[3] = '/';
    // writes 16 characters plus the terminator, i.e. up to sema_name[20]
    GenRandomString(&sema_name[4], 16);
    if ((semPtr = sem_open(sema_name, O_CREAT, 0644, init_val)) == SEM_FAILED) {
      perror("sem_open");
      exit(1);
    }
    utils::Assert(semPtr != NULL, "create Semaphore error");
  }
  /*! \brief close and unlink the named semaphore, exits on failure */
  inline void Destroy(void) {
    if (sem_close(semPtr) == -1) {
      perror("sem_close");
      exit(EXIT_FAILURE);
    }
    if (sem_unlink(sema_name) == -1) {
      perror("sem_unlink");
      exit(EXIT_FAILURE);
    }
  }
  /*! \brief block until the semaphore can be decremented */
  inline void Wait(void) {
    sem_wait(semPtr);
  }
  /*! \brief increment the semaphore by one */
  inline void Post(void) {
    sem_post(semPtr);
  }
#else

 private:
  sem_t sem;

 public:
  /*! \brief create the semaphore with init_val as initial count */
  inline void Init(int init_val) {
    sem_init(&sem, 0, init_val);
  }
  /*! \brief destroy the semaphore */
  inline void Destroy(void) {
    sem_destroy(&sem);
  }
  /*! \brief block until the semaphore can be decremented */
  inline void Wait(void) {
    sem_wait(&sem);
  }
  /*! \brief increment the semaphore by one */
  inline void Post(void) {
    sem_post(&sem);
  }
#endif
};
/*!\brief simple thread class, wraps a joinable pthread */
class Thread {
 private:
  pthread_t thread;

 public:
  /*!
   * \brief start a joinable thread that runs entry(param)
   * \param entry the function executed by the new thread
   * \param param the argument handed to entry
   */
  inline void Start(void * entry(void*), void *param) {
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    pthread_create(&thread, &attr, entry, param);
    // the attribute object is only needed for the creation
    pthread_attr_destroy(&attr);
  }
  /*!
   * \brief wait until the thread has finished
   * \return the result of pthread_join, 0 on success
   */
  inline int Join(void) {
    void *status;
    return pthread_join(thread, &status);
  }
};
/*!
 * \brief exit function called from thread, ends the calling thread
 * \param status value handed to pthread_exit
 */
inline void ThreadExit(void *status) {
  pthread_exit(status);
}
} // namespace utils
} // namespace xgboost
#define XGBOOST_THREAD_PREFIX void *
#endif
#endif

200
src/utils/thread_buffer.h Normal file
View File

@ -0,0 +1,200 @@
#ifndef XGBOOST_UTILS_THREAD_BUFFER_H
#define XGBOOST_UTILS_THREAD_BUFFER_H
/*!
* \file thread_buffer.h
* \brief multi-thread buffer, iterator, can be used to create parallel pipeline
* \author Tianqi Chen
*/
#include <vector>
#include <cstring>
#include <cstdlib>
#include "./utils.h"
#include "./thread.h"
namespace xgboost {
namespace utils {
/*!
* \brief buffered loading iterator that uses multithread
* a loader thread fills one of two buffers through ElemFactory::LoadNext
* while the consumer reads the other buffer through Next
* \tparam Elem element type to be buffered
* \tparam ElemFactory factory type to implement in order to use thread buffer,
* this class calls its Init, SetParam, Create, LoadNext, FreeSpace,
* BeforeFirst and Destroy members
*/
template<typename Elem, typename ElemFactory>
class ThreadBuffer {
public:
/*!\brief constructor, sets the default buffer size to 30 elements */
ThreadBuffer(void) {
this->init_end = false;
this->buf_size = 30;
}
/*!\brief destructor, calls Destroy if Init has completed */
~ThreadBuffer(void) {
if(init_end) this->Destroy();
}
/*!
* \brief set parameter, will also pass the parameter to factory
* \param name name of the parameter, "buffer_size" sets buf_size
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val) {
if (!strcmp( name, "buffer_size")) buf_size = atoi(val);
factory.SetParam(name, val);
}
/*!
* \brief initialize the buffered iterator
* creates buf_size elements for each of the two buffers and starts the loader thread
* \return false if the initialization can't be done, i.e. the factory Init returned false
*/
inline bool Init(void) {
if (!factory.Init()) return false;
for (int i = 0; i < buf_size; ++i) {
bufA.push_back(factory.Create());
bufB.push_back(factory.Create());
}
this->init_end = true;
this->StartLoader();
return true;
}
/*!\brief place the iterator before first value */
inline void BeforeFirst(void) {
// wait till last loader end
loading_end.Wait();
// critical zone, the loader is waiting on loading_need
current_buf = 1;
factory.BeforeFirst();
// reset terminate limit
endA = endB = buf_size;
// wake up loader for first part
loading_need.Post();
// wait till first part is loaded
loading_end.Wait();
// set current buf to right value
current_buf = 0;
// wake loader for next part
data_loaded = false;
loading_need.Post();
// set buffer value
buf_index = 0;
}
/*! \brief destroy the buffer iterator, will deallocate the buffer */
inline void Destroy(void) {
// raise the signal and wake the loader so that it can see it, then wait for the thread to end
this->destroy_signal = true;
loading_need.Post();
loader_thread.Join();
loading_need.Destroy();
loading_end.Destroy();
for (size_t i = 0; i < bufA.size(); ++i) {
factory.FreeSpace(bufA[i]);
}
for (size_t i = 0; i < bufB.size(); ++i) {
factory.FreeSpace(bufB[i]);
}
bufA.clear(); bufB.clear();
factory.Destroy();
this->init_end = false;
}
/*!
* \brief get the next element needed in buffer
* \param elem element to store into
* \return true if an element was stored into elem,
* false if the end of the loaded data is reached
*/
inline bool Next(Elem &elem) {
// end of buffer try to switch
if (buf_index == buf_size) {
this->SwitchBuffer();
buf_index = 0;
}
// the consumer reads bufA when current_buf is non-zero and bufB otherwise
if (buf_index >= (current_buf ? endA : endB)) {
return false;
}
std::vector<Elem> &buf = current_buf ? bufA : bufB;
elem = buf[buf_index];
++buf_index;
return true;
}
/*!
* \brief get the factory object
*/
inline ElemFactory &get_factory(void) {
return factory;
}
// size of buffer, number of elements in each of the two buffers
int buf_size;
private:
// factory object used to load configures
ElemFactory factory;
// index in current buffer
int buf_index;
// indicate which one is current buffer
int current_buf;
// max limit of visit, also marks termination
int endA, endB;
// double buffer, one is accessed by loader
// the other is accessed by consumer
// buffer of the data
std::vector<Elem> bufA, bufB;
// initialization end
bool init_end;
// signal whether the data is loaded
bool data_loaded;
// signal to kill the thread
bool destroy_signal;
// thread object
Thread loader_thread;
// signal of the buffer
Semaphore loading_end, loading_need;
/*!
* \brief slave thread
* this implementation is like producer-consumer style
*/
inline void RunLoader(void) {
while(!destroy_signal) {
// sleep until loading is needed
loading_need.Wait();
// the loader fills the buffer that the consumer is not reading
std::vector<Elem> &buf = current_buf ? bufB : bufA;
int i;
for (i = 0; i < buf_size ; ++i) {
if (!factory.LoadNext(buf[i])) {
int &end = current_buf ? endB : endA;
end = i; // marks the termination
break;
}
}
// signal that loading is done
data_loaded = true;
loading_end.Post();
}
}
/*!\brief entry point of loader thread, pthread is the ThreadBuffer to run the loader of */
inline static XGBOOST_THREAD_PREFIX LoaderEntry(void *pthread) {
static_cast< ThreadBuffer<Elem,ElemFactory>* >(pthread)->RunLoader();
ThreadExit(NULL);
return NULL;
}
/*!\brief start loader thread */
inline void StartLoader(void) {
destroy_signal = false;
// set param
current_buf = 1;
// loading_need starts at 1 so that the loader begins to load right away
loading_need.Init(1);
loading_end .Init(0);
// reset terminate limit
endA = endB = buf_size;
loader_thread.Start(LoaderEntry, this);
// wait until first part of data is loaded
loading_end.Wait();
// set current buf to right value
current_buf = 0;
// wake loader for next part
data_loaded = false;
loading_need.Post();
buf_index = 0;
}
/*!\brief switch double buffer */
inline void SwitchBuffer(void) {
loading_end.Wait();
// loader shall be sleep now, critical zone!
current_buf = !current_buf;
// wake up loader
data_loaded = false;
loading_need.Post();
}
};
} // namespace utils
} // namespace xgboost
#endif

View File

@ -3,10 +3,11 @@
import ctypes import ctypes
import os import os
# optinally have scipy sparse, though not necessary # optinally have scipy sparse, though not necessary
import numpy import numpy as np
import sys import sys
import numpy.ctypeslib import numpy.ctypeslib
import scipy.sparse as scp import scipy.sparse as scp
import random
# set this line correctly # set this line correctly
if os.name == 'nt': if os.name == 'nt':
@ -32,16 +33,28 @@ xglib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p)
def ctypes2numpy(cptr, length, dtype): def ctypes2numpy(cptr, length, dtype):
# convert a ctypes pointer array to numpy """convert a ctypes pointer array to numpy array """
assert isinstance(cptr, ctypes.POINTER(ctypes.c_float)) assert isinstance(cptr, ctypes.POINTER(ctypes.c_float))
res = numpy.zeros(length, dtype=dtype) res = numpy.zeros(length, dtype=dtype)
assert ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]) assert ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0])
return res return res
# data matrix used in xgboost
class DMatrix: class DMatrix:
"""data matrix used in xgboost"""
# constructor # constructor
def __init__(self, data, label=None, missing=0.0, weight = None): def __init__(self, data, label=None, missing=0.0, weight = None):
""" constructor of DMatrix
Args:
data: string/numpy array/scipy.sparse
data source, string type is the path of svmlight format txt file or xgb buffer
label: list or numpy 1d array, optional
label of training data
missing: float
value in data which need to be present as missing value
weight: list or numpy 1d array, optional
weight for each instances
"""
# force into void_p, mac need to pass things in as void_p # force into void_p, mac need to pass things in as void_p
if data == None: if data == None:
self.handle = None self.handle = None
@ -63,22 +76,25 @@ class DMatrix:
self.set_label(label) self.set_label(label)
if weight !=None: if weight !=None:
self.set_weight(weight) self.set_weight(weight)
# convert data from csr matrix
def __init_from_csr(self, csr): def __init_from_csr(self, csr):
"""convert data from csr matrix"""
assert len(csr.indices) == len(csr.data) assert len(csr.indices) == len(csr.data)
self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSR( self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSR(
(ctypes.c_ulong * len(csr.indptr))(*csr.indptr), (ctypes.c_ulong * len(csr.indptr))(*csr.indptr),
(ctypes.c_uint * len(csr.indices))(*csr.indices), (ctypes.c_uint * len(csr.indices))(*csr.indices),
(ctypes.c_float * len(csr.data))(*csr.data), (ctypes.c_float * len(csr.data))(*csr.data),
len(csr.indptr), len(csr.data))) len(csr.indptr), len(csr.data)))
# convert data from numpy matrix
def __init_from_npy2d(self,mat,missing): def __init_from_npy2d(self,mat,missing):
"""convert data from numpy matrix"""
data = numpy.array(mat.reshape(mat.size), dtype='float32') data = numpy.array(mat.reshape(mat.size), dtype='float32')
self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromMat( self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromMat(
data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
mat.shape[0], mat.shape[1], ctypes.c_float(missing))) mat.shape[0], mat.shape[1], ctypes.c_float(missing)))
# destructor
def __del__(self): def __del__(self):
"""destructor"""
xglib.XGDMatrixFree(self.handle) xglib.XGDMatrixFree(self.handle)
def get_float_info(self, field): def get_float_info(self, field):
length = ctypes.c_ulong() length = ctypes.c_ulong()
@ -96,16 +112,39 @@ class DMatrix:
def set_uint_info(self, field, data): def set_uint_info(self, field, data):
xglib.XGDMatrixSetUIntInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')), xglib.XGDMatrixSetUIntInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
(ctypes.c_uint*len(data))(*data), len(data)) (ctypes.c_uint*len(data))(*data), len(data))
# load data from file
def save_binary(self, fname, silent=True): def save_binary(self, fname, silent=True):
"""save DMatrix to XGBoost buffer
Args:
fname: string
name of buffer file
slient: bool, option
whether print info
Returns:
None
"""
xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent)) xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent))
# set label of dmatrix
def set_label(self, label): def set_label(self, label):
"""set label of dmatrix
Args:
label: list
label for DMatrix
Returns:
None
"""
self.set_float_info('label', label) self.set_float_info('label', label)
# set weight of each instances
def set_weight(self, weight): def set_weight(self, weight):
"""set weight of each instances
Args:
weight: float
weight for positive instance
Returns:
None
"""
self.set_float_info('weight', weight) self.set_float_info('weight', weight)
# set initialized margin prediction
def set_base_margin(self, margin): def set_base_margin(self, margin):
""" """
set base margin of booster to start from set base margin of booster to start from
@ -116,31 +155,143 @@ class DMatrix:
see also example/demo.py see also example/demo.py
""" """
self.set_float_info('base_margin', margin) self.set_float_info('base_margin', margin)
# set group size of dmatrix, used for rank
def set_group(self, group): def set_group(self, group):
"""set group size of dmatrix, used for rank
Args:
group:
Returns:
None
"""
xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group)) xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group))
# get label from dmatrix
def get_label(self): def get_label(self):
"""get label from dmatrix
Args:
None
Returns:
list, label of data
"""
return self.get_float_info('label') return self.get_float_info('label')
# get weight from dmatrix
def get_weight(self): def get_weight(self):
"""get weight from dmatrix
Args:
None
Returns:
float, weight
"""
return self.get_float_info('weight') return self.get_float_info('weight')
# get base_margin from dmatrix
def get_base_margin(self): def get_base_margin(self):
"""get base_margin from dmatrix
Args:
None
Returns:
float, base margin
"""
return self.get_float_info('base_margin') return self.get_float_info('base_margin')
def num_row(self): def num_row(self):
"""get number of rows
Args:
None
Returns:
int, num rows
"""
return xglib.XGDMatrixNumRow(self.handle) return xglib.XGDMatrixNumRow(self.handle)
# slice the DMatrix to return a new DMatrix that only contains rindex
def slice(self, rindex): def slice(self, rindex):
"""slice the DMatrix to return a new DMatrix that only contains rindex
Args:
rindex: list
list of index to be chosen
Returns:
res: DMatrix
new DMatrix with chosen index
"""
res = DMatrix(None) res = DMatrix(None)
res.handle = ctypes.c_void_p(xglib.XGDMatrixSliceDMatrix( res.handle = ctypes.c_void_p(xglib.XGDMatrixSliceDMatrix(
self.handle, (ctypes.c_int*len(rindex))(*rindex), len(rindex))) self.handle, (ctypes.c_int*len(rindex))(*rindex), len(rindex)))
return res return res
class CVPack:
    """one fold of a cross validation: a train/test pair and its own booster"""
    def __init__(self, dtrain, dtest, param):
        """create the booster of the fold
        Args:
            dtrain: DMatrix
                training part of the fold
            dtest: DMatrix
                held out part of the fold
            param:
                parameters handed to the Booster
        """
        self.dtrain = dtrain
        self.dtest = dtest
        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
        self.bst = Booster(param, [dtrain, dtest])

    def update(self, r):
        """run boosting iteration r on the training part"""
        self.bst.update(self.dtrain, r)

    def eval(self, r):
        """evaluate iteration r on the train and test part, returns the result of eval_set"""
        return self.bst.eval_set(self.watchlist, r)
def mknfold(dall, nfold, param, seed, weightscale=None):
    """
    mk nfold list of cvpack from randidx
    Args:
        dall: DMatrix
            the data to be split
        nfold: int
            number of folds
        param:
            parameters handed to every CVPack
        seed:
            seed of the random shuffle of the row indices
        weightscale: optional
            if given, the weights of both parts of a fold are multiplied by
            weightscale * (rows of dall) / (rows of the part)
    Returns:
        list of CVPack, one per fold
    Note:
        every fold has len // nfold rows, the remaining rows are not used
    """
    # range() is not a list on Python 3, shuffle needs a mutable sequence
    randidx = list(range(dall.num_row()))
    random.seed(seed)
    random.shuffle(randidx)
    # integer division, slice bounds have to be int on Python 3 as well
    kstep = len(randidx) // nfold
    idxset = []
    for i in range(nfold):
        idxset.append(randidx[(i * kstep) : min(len(randidx), (i + 1) * kstep)])
    ret = []
    for k in range(nfold):
        trainlst = []
        for j in range(nfold):
            if j == k:
                testlst = idxset[j]
            else:
                trainlst += idxset[j]
        dtrain = dall.slice(trainlst)
        dtest = dall.slice(testlst)
        # rescale weight of dtrain and dtest
        if weightscale is not None:
            dtrain.set_weight(dtrain.get_weight() * weightscale * dall.num_row() / dtrain.num_row())
            dtest.set_weight(dtest.get_weight() * weightscale * dall.num_row() / dtest.num_row())
        ret.append(CVPack(dtrain, dtest, param))
    return ret
def aggcv(rlist):
    """
    aggregate cross validation results
    Args:
        rlist: list of string
            one evaluation string per fold, each is a leading tag followed
            by whitespace separated name:value items
    Returns:
        string, the leading tag followed by a tab separated
        name:mean+std item for every name, sorted by name
    """
    header = rlist[0].split()[0]
    scores = {}
    for lineno, line in enumerate(rlist):
        fields = line.split()
        if lineno > 0:
            # all folds have to report the same leading tag
            assert header == fields[0]
        for item in fields[1:]:
            name, value = item.split(':')
            if lineno == 0:
                scores[name] = [float(value)]
            else:
                scores[name].append(float(value))
    parts = [header]
    for name in sorted(scores):
        values = np.array(scores[name])
        parts.append('%s:%f+%f' % (name, np.mean(values), np.std(values)))
    return '\t'.join(parts)
class Booster: class Booster:
"""learner class """ """learner class """
def __init__(self, params={}, cache=[], model_file = None): def __init__(self, params={}, cache=[], model_file = None):
""" constructor, param: """ """ constructor
Args:
params: dict
params for boosters
cache: list
list of cache item
model_file: string
path of model file
Returns:
None
"""
for d in cache: for d in cache:
assert isinstance(d, DMatrix) assert isinstance(d, DMatrix)
dmats = (ctypes.c_void_p * len(cache))(*[ d.handle for d in cache]) dmats = (ctypes.c_void_p * len(cache))(*[ d.handle for d in cache])
@ -166,16 +317,30 @@ class Booster:
xglib.XGBoosterSetParam( xglib.XGBoosterSetParam(
self.handle, ctypes.c_char_p(k.encode('utf-8')), self.handle, ctypes.c_char_p(k.encode('utf-8')),
ctypes.c_char_p(str(v).encode('utf-8'))) ctypes.c_char_p(str(v).encode('utf-8')))
def update(self, dtrain, it): def update(self, dtrain, it):
""" """
update update
dtrain: the training DMatrix Args:
it: current iteration number dtrain: DMatrix
the training DMatrix
it: int
current iteration number
Returns:
None
""" """
assert isinstance(dtrain, DMatrix) assert isinstance(dtrain, DMatrix)
xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle) xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle)
def boost(self, dtrain, grad, hess): def boost(self, dtrain, grad, hess):
""" update """ """ update
Args:
dtrain: DMatrix
the training DMatrix
grad: list
the first order of gradient
hess: list
the second order of gradient
"""
assert len(grad) == len(hess) assert len(grad) == len(hess)
assert isinstance(dtrain, DMatrix) assert isinstance(dtrain, DMatrix)
xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle, xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle,
@ -183,6 +348,14 @@ class Booster:
(ctypes.c_float*len(hess))(*hess), (ctypes.c_float*len(hess))(*hess),
len(grad)) len(grad))
def eval_set(self, evals, it = 0): def eval_set(self, evals, it = 0):
"""evaluates by metric
Args:
evals: list of tuple (DMatrix, string)
lists of items to be evaluated
it: int
Returns:
evals result
"""
for d in evals: for d in evals:
assert isinstance(d[0], DMatrix) assert isinstance(d[0], DMatrix)
assert isinstance(d[1], str) assert isinstance(d[1], str)
@ -195,21 +368,46 @@ class Booster:
def predict(self, data, output_margin=False): def predict(self, data, output_margin=False):
""" """
predict with data predict with data
data: the dmatrix storing the input Args:
output_margin: whether output raw margin value that is untransformed data: DMatrix
the dmatrix storing the input
output_margin: bool
whether output raw margin value that is untransformed
Returns:
numpy array of prediction
""" """
length = ctypes.c_ulong() length = ctypes.c_ulong()
preds = xglib.XGBoosterPredict(self.handle, data.handle, preds = xglib.XGBoosterPredict(self.handle, data.handle,
int(output_margin), ctypes.byref(length)) int(output_margin), ctypes.byref(length))
return ctypes2numpy(preds, length.value, 'float32') return ctypes2numpy(preds, length.value, 'float32')
def save_model(self, fname): def save_model(self, fname):
""" save model to file """ """ save model to file
Args:
fname: string
file name of saving model
Returns:
None
"""
xglib.XGBoosterSaveModel(self.handle, ctypes.c_char_p(fname.encode('utf-8'))) xglib.XGBoosterSaveModel(self.handle, ctypes.c_char_p(fname.encode('utf-8')))
def load_model(self, fname): def load_model(self, fname):
"""load model from file""" """load model from file
Args:
fname: string
file name of saving model
Returns:
None
"""
xglib.XGBoosterLoadModel( self.handle, ctypes.c_char_p(fname.encode('utf-8')) ) xglib.XGBoosterLoadModel( self.handle, ctypes.c_char_p(fname.encode('utf-8')) )
def dump_model(self, fo, fmap=''): def dump_model(self, fo, fmap=''):
"""dump model into text file""" """dump model into text file
Args:
fo: string
file name to be dumped
fmap: string, optional
file name of feature map names
Returns:
None
"""
if isinstance(fo,str): if isinstance(fo,str):
fo = open(fo,'w') fo = open(fo,'w')
need_close = True need_close = True
@ -248,7 +446,17 @@ class Booster:
return fmap return fmap
def evaluate(bst, evals, it, feval = None): def evaluate(bst, evals, it, feval = None):
"""evaluation on eval set""" """evaluation on eval set
Args:
bst: XGBoost object
object of XGBoost model
evals: list of tuple (DMatrix, string)
obj need to be evaluated
it: int
feval: optional
Returns:
eval result
"""
if feval != None: if feval != None:
res = '[%d]' % it res = '[%d]' % it
for dm, evname in evals: for dm, evname in evals:
@ -259,8 +467,22 @@ def evaluate(bst, evals, it, feval = None):
return res return res
def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None): def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None):
""" train a booster with given paramaters """ """ train a booster with given paramaters
Args:
params: dict
params of booster
dtrain: DMatrix
data to be trained
num_boost_round: int
num of round to be boosted
evals: list
list of items to be evaluated
obj:
feval:
"""
bst = Booster(params, [dtrain]+[ d[0] for d in evals ] ) bst = Booster(params, [dtrain]+[ d[0] for d in evals ] )
if obj == None: if obj == None:
for i in range(num_boost_round): for i in range(num_boost_round):
@ -276,3 +498,27 @@ def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None
if len(evals) != 0: if len(evals) != 0:
sys.stderr.write(evaluate(bst, evals, i, feval)+'\n') sys.stderr.write(evaluate(bst, evals, i, feval)+'\n')
return bst return bst
def cv(params, dtrain, num_boost_round = 10, nfold=3, evals = [], obj=None, feval=None):
    """ cross validation with given parameters
    the aggregated evaluation of every round is written to stderr
    Args:
        params: dict
            params of booster
        dtrain: DMatrix
            data to be trained
        num_boost_round: int
            num of round to be boosted
        nfold: int
            folds to do cv
        evals: list
            list of items to be evaluated, each one is passed on
            as an 'eval_metric' parameter
        obj:
            not used by this function
        feval:
            not used by this function
    Returns:
        None
    """
    base_params = list(params.items())
    metric_params = [('eval_metric', metric) for metric in evals]
    # the folds are always built with seed 0
    folds = mknfold(dtrain, nfold, base_params + metric_params, 0)
    for rnd in range(num_boost_round):
        for fold in folds:
            fold.update(rnd)
        summary = aggcv([fold.eval(rnd) for fold in folds])
        sys.stderr.write(summary + '\n')