Merge remote-tracking branch 'dmlc/master'

This commit is contained in:
El Potaeto 2015-04-15 18:48:26 +02:00
commit de3f74f755
118 changed files with 1305 additions and 11320 deletions

View File

@ -16,18 +16,28 @@ ifeq ($(cxx11),1)
else
endif
ifeq ($(hdfs),1)
CFLAGS+= -DRABIT_USE_HDFS=1 -I$(HADOOP_HDFS_HOME)/include -I$(JAVA_HOME)/include
LDFLAGS+= -L$(HADOOP_HDFS_HOME)/lib/native -L$(JAVA_HOME)/jre/lib/amd64/server -lhdfs -ljvm
# handling dmlc
ifdef dmlc
ifndef config
ifneq ("$(wildcard $(dmlc)/config.mk)","")
config = $(dmlc)/config.mk
else
CFLAGS+= -DRABIT_USE_HDFS=0
config = $(dmlc)/make/config.mk
endif
endif
include $(config)
include $(dmlc)/make/dmlc.mk
LDFLAGS+= $(DMLC_LDFLAGS)
LIBDMLC=$(dmlc)/libdmlc.a
else
LIBDMLC=dmlc_simple.o
endif
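# Example invocations (paths are hypothetical): `make dmlc=../dmlc-core`
# picks up ../dmlc-core/config.mk if present (else ../dmlc-core/make/config.mk)
# and links against ../dmlc-core/libdmlc.a, while a plain `make` falls back
# to the bundled dmlc_simple.o stub.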
# specify tensor path
BIN = xgboost
MOCKBIN = xgboost.mock
OBJ = updater.o gbm.o io.o main.o
MPIBIN = xgboost.mpi
OBJ = updater.o gbm.o io.o main.o dmlc_simple.o
MPIBIN =
SLIB = wrapper/libxgboostwrapper.so
.PHONY: clean all mpi python Rpack
@ -38,23 +48,22 @@ mpi: $(MPIBIN)
python: wrapper/libxgboostwrapper.so
# the wrapper now takes in two parts: the io part and the wrapper part
updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h
dmlc_simple.o: src/io/dmlc_simple.cpp src/utils/*.h
gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h
xgboost.mpi: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit_mpi.a
xgboost.mock: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit_mock.a
xgboost: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit.a
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o subtree/rabit/lib/librabit.a
xgboost: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit.a $(LIBDMLC)
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o subtree/rabit/lib/librabit.a $(LIBDMLC)
# dependency on rabit
subtree/rabit/lib/librabit.a: subtree/rabit/src/engine.cc
cd subtree/rabit;make lib/librabit.a; cd ../..
subtree/rabit/lib/librabit_empty.a: subtree/rabit/src/engine_empty.cc
cd subtree/rabit;make lib/librabit_empty.a; cd ../..
subtree/rabit/lib/librabit_mock.a: subtree/rabit/src/engine_mock.cc
cd subtree/rabit;make lib/librabit_mock.a; cd ../..
subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc
cd subtree/rabit;make lib/librabit_mpi.a; cd ../..
$(BIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)

View File

@ -18,7 +18,12 @@ License: Apache License (== 2.0) | file LICENSE
URL: https://github.com/dmlc/xgboost
BugReports: https://github.com/dmlc/xgboost/issues
VignetteBuilder: knitr
Suggests: knitr
Suggests:
knitr,
ggplot2 (>= 1.0.0),
DiagrammeR (>= 0.4),
Ckmeans.1d.dp (>= 3.3.1),
vcd (>= 1.3)
Depends:
R (>= 2.10)
Imports:
@ -26,8 +31,4 @@ Imports:
methods,
data.table (>= 1.9.4),
magrittr (>= 1.5),
stringr (>= 0.6.2),
DiagrammeR (>= 0.4),
ggplot2 (>= 1.0.0),
Ckmeans.1d.dp (>= 3.3.1),
vcd (>= 1.3)
stringr (>= 0.6.2)

View File

@ -21,8 +21,6 @@ exportMethods(predict)
import(methods)
importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,dgeMatrix)
importFrom(Ckmeans.1d.dp,Ckmeans.1d.dp)
importFrom(DiagrammeR,mermaid)
importFrom(Matrix,cBind)
importFrom(Matrix,colSums)
importFrom(Matrix,sparseVector)
@ -34,16 +32,6 @@ importFrom(data.table,fread)
importFrom(data.table,rbindlist)
importFrom(data.table,set)
importFrom(data.table,setnames)
importFrom(ggplot2,aes)
importFrom(ggplot2,coord_flip)
importFrom(ggplot2,element_blank)
importFrom(ggplot2,element_text)
importFrom(ggplot2,geom_bar)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,ggtitle)
importFrom(ggplot2,theme)
importFrom(ggplot2,xlab)
importFrom(ggplot2,ylab)
importFrom(magrittr,"%>%")
importFrom(magrittr,add)
importFrom(magrittr,not)

View File

@ -36,8 +36,8 @@ xgb.setinfo <- function(dmat, name, info) {
return(TRUE)
}
if (name == "group") {
if (length(info)!=xgb.numrow(dmat))
stop("The length of groups must equal to the number of rows in the input data")
if (sum(info)!=xgb.numrow(dmat))
stop("The sum of groups must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info),
PACKAGE = "xgboost")
return(TRUE)
@ -77,9 +77,9 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
}
# convert xgb.Booster.handle to xgb.Booster
xgb.handleToBooster <- function(handle)
xgb.handleToBooster <- function(handle, raw = NULL)
{
bst <- list(handle = handle, raw = NULL)
bst <- list(handle = handle, raw = raw)
class(bst) <- "xgb.Booster"
return(bst)
}
@ -87,8 +87,12 @@ xgb.handleToBooster <- function(handle)
# Check whether an xgb.Booster object is complete
xgb.Booster.check <- function(bst, saveraw = TRUE)
{
if (is.null(bst$handle)) {
bst$handle <- xgb.load(bst$raw)
isnull <- is.null(bst$handle)
if (!isnull) {
isnull <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost")
}
if (isnull) {
bst$handle <- xgb.Booster(modelfile = bst$raw)
} else {
if (is.null(bst$raw) && saveraw)
bst$raw <- xgb.save.raw(bst$handle)
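# Note: handles are external pointers, so a Booster restored via save/load
# or readRDS can carry a dangling handle; XGCheckNullPtr_R detects this and
# the model is then rebuilt from the serialized bytes kept in bst$raw.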

View File

@ -95,6 +95,17 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
}
folds <- xgb.cv.mknfold(dtrain, nfold, params)
obj_type = params[['objective']]
mat_pred = FALSE
if (!is.null(obj_type) && obj_type=='multi:softprob')
{
num_class = params[['num_class']]
if (is.null(num_class))
stop('must set num_class to use softmax')
predictValues <- matrix(0,xgb.numrow(dtrain),num_class)
mat_pred = TRUE
}
else
predictValues <- rep(0,xgb.numrow(dtrain))
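# With multi:softprob the booster emits one probability per class, so the
# out-of-fold predictions are collected in an nrow x num_class matrix;
# all other objectives yield a single prediction per row.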
history <- c()
for (i in 1:nrounds) {
@ -102,14 +113,23 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
for (k in 1:nfold) {
fd <- folds[[k]]
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
if (i<nrounds) {
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
} else {
if (!prediction) {
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
} else {
res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
if (mat_pred) {
pred_mat = matrix(res[[2]],num_class,length(fd$index))
predictValues[fd$index,] <- t(pred_mat)
} else {
predictValues[fd$index] <- res[[2]]
}
msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
}
}
}
ret <- xgb.cv.aggcv(msg, showsd)
history <- c(history, ret)
if(verbose) paste(ret, "\n", sep="") %>% cat

View File

@ -21,7 +21,12 @@ xgb.load <- function(modelfile) {
stop("xgb.load: modelfile cannot be NULL")
handle <- xgb.Booster(modelfile = modelfile)
bst <- xgb.handleToBooster(handle)
# re-use modelfile if it is raw, so we do not need to serialize
if (typeof(modelfile) == "raw") {
bst <- xgb.handleToBooster(handle, modelfile)
} else {
bst <- xgb.handleToBooster(handle, NULL)
}
bst <- xgb.Booster.check(bst)
return(bst)
}

View File

@ -96,6 +96,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
allTrees <- data.table()
anynumber_regex<-"[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
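# anynumber_regex matches an optionally signed decimal with an optional
# exponent, so values such as gain=1.5e-05 are parsed correctly (the
# previous patterns only handled plain decimals).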
for(i in 1:n_round){
tree <- text[(position[i]+1):(position[i+1]-1)]
@ -115,7 +116,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
featureBranch <- feature_names[featureBranch + 1]
}
featureLeaf <- rep("Leaf", length(leaf))
splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "")
splitBranch <- str_extract(branch, paste0("<",anynumber_regex,"\\]")) %>% str_replace("<", "") %>% str_replace("\\]", "")
splitLeaf <- rep(NA, length(leaf))
yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID)
yesLeaf <- rep(NA, length(leaf))
@ -123,8 +124,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
noLeaf <- rep(NA, length(leaf))
missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID)
missingLeaf <- rep(NA, length(leaf))
qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*")
qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*")
qualityBranch <- extract(branch, paste0("gain=",anynumber_regex))
qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex))
coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID]

View File

@ -2,17 +2,6 @@
#'
#' Read a data.table containing feature importance details and plot it.
#'
#' @importFrom ggplot2 ggplot
#' @importFrom ggplot2 aes
#' @importFrom ggplot2 geom_bar
#' @importFrom ggplot2 coord_flip
#' @importFrom ggplot2 xlab
#' @importFrom ggplot2 ylab
#' @importFrom ggplot2 ggtitle
#' @importFrom ggplot2 theme
#' @importFrom ggplot2 element_text
#' @importFrom ggplot2 element_blank
#' @importFrom Ckmeans.1d.dp Ckmeans.1d.dp
#' @importFrom magrittr %>%
#' @param importance_matrix a \code{data.table} returned by the \code{xgb.importance} function.
#' @param numberOfClusters a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.
@ -44,11 +33,17 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
if (!"data.table" %in% class(importance_matrix)) {
stop("importance_matrix: Should be a data.table.")
}
if (!require(ggplot2, quietly = TRUE)) {
stop("ggplot2 package is required for plotting the importance", call. = FALSE)
}
if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE)
}
# To avoid issues in clustering when co-occurrences are used
importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature]
clusters <- suppressWarnings(Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
importance_matrix[,"Cluster":=clusters$cluster %>% as.character]
plot <- ggplot(importance_matrix, aes(x=reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+ geom_bar(aes(fill=Cluster), stat="identity", position="identity") + coord_flip() + xlab("Features") + ylab("Gain") + ggtitle("Feature importance") + theme(plot.title = element_text(lineheight=.9, face="bold"), panel.grid.major.y = element_blank() )

View File

@ -15,7 +15,6 @@
#' @importFrom stringr str_split
#' @importFrom stringr str_extract
#' @importFrom stringr str_trim
#' @importFrom DiagrammeR mermaid
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).
#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
@ -65,6 +64,10 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
if (!requireNamespace("DiagrammeR", quietly = TRUE)) {
stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE)
}
if(is.null(model)){
allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree)
} else {
@ -85,7 +88,7 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU
no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";")
mermaid(path, width, height)
DiagrammeR::mermaid(path, width, height)
}
# Avoid error messages during CRAN check.

View File

@ -4,4 +4,5 @@ PKGROOT=../../
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o

View File

@ -15,5 +15,5 @@ xgblib:
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT) -I../..
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o
$(OBJECTS) : xgblib

View File

@ -59,6 +59,9 @@ inline void _WrapperEnd(void) {
}
extern "C" {
SEXP XGCheckNullPtr_R(SEXP handle) {
return ScalarLogical(R_ExternalPtrAddr(handle) == NULL);
}
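/* used by the R side (see xgb.Booster.check) to detect a handle whose
   external pointer became NULL after deserialization */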
void _DMatrixFinalizer(SEXP ext) {
if (R_ExternalPtrAddr(ext) == NULL) return;
XGDMatrixFree(R_ExternalPtrAddr(ext));

View File

@ -11,6 +11,12 @@ extern "C" {
}
extern "C" {
/*!
* \brief check whether a handle is NULL
* \param handle
* \return whether it is null ptr
*/
SEXP XGCheckNullPtr_R(SEXP handle);
/*!
* \brief load a data matrix
* \param fname name of the content

View File

@ -1,5 +1,4 @@
#!/usr/bin/python
import sys
def loadfmap( fname ):
fmap = {}

View File

@ -1,10 +1,6 @@
#!/usr/bin/python
import sys
import numpy as np
import scipy.sparse
# append the path to xgboost, you may need to change the following line
# alternatively, you can add the path to PYTHONPATH environment variable
sys.path.append('../../wrapper')
import xgboost as xgb
### simple example

View File

@ -1,7 +1,5 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
dtrain = xgb.DMatrix('../data/agaricus.txt.train')

View File

@ -1,7 +1,5 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
### load data in do training

View File

@ -1,7 +1,5 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
###
# advanced: customized loss function

View File

@ -1,6 +1,4 @@
#!/usr/bin/python
import sys
sys.path.append('../../wrapper')
import xgboost as xgb
##
# this script demonstrate how to fit generalized linear model in xgboost

View File

@ -1,7 +1,5 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
### load data in do training

View File

@ -1,7 +1,5 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
### load data in do training

View File

@ -0,0 +1,62 @@
'''
Created on 1 Apr 2015
@author: Jamie Hall
'''
import xgboost as xgb
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston
rng = np.random.RandomState(31337)
print("Zeros and Ones from the Digits dataset: binary classification")
digits = load_digits(2)
y = digits['target']
X = digits['data']
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
predictions = xgb_model.predict(X[test_index])
actuals = y[test_index]
print(confusion_matrix(actuals, predictions))
print("Iris: multiclass classification")
iris = load_iris()
y = iris['target']
X = iris['data']
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
predictions = xgb_model.predict(X[test_index])
actuals = y[test_index]
print(confusion_matrix(actuals, predictions))
print("Boston Housing: regression")
boston = load_boston()
y = boston['target']
X = boston['data']
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])
predictions = xgb_model.predict(X[test_index])
actuals = y[test_index]
print(mean_squared_error(actuals, predictions))
print("Parameter optimization")
y = boston['target']
X = boston['data']
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model,
{'max_depth': [2,4,6],
'n_estimators': [50,100,200]}, verbose=1)
clf.fit(X,y)
print(clf.best_score_)
print(clf.best_params_)

View File

@ -1,7 +1,5 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
### load data in do training

View File

@ -1,14 +1,6 @@
#!/usr/bin/python
# this is the example script to use xgboost to train
import inspect
import os
import sys
import numpy as np
# add path of xgboost python module
code_path = os.path.join(
os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../wrapper")
sys.path.append(code_path)
import xgboost as xgb

View File

@ -1,9 +1,6 @@
#!/usr/bin/python
# make prediction
import sys
import numpy as np
# add path of xgboost python module
sys.path.append('../../wrapper/')
import xgboost as xgb
# path to where the data lies

View File

@ -1,9 +1,6 @@
#!/usr/bin/python
# this is the example script to use xgboost to train
import sys
import numpy as np
# add path of xgboost python module
sys.path.append('../../wrapper/')
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
import time

View File

@ -1,43 +0,0 @@
require(xgboost)
require(methods)
train = read.csv('data/train.csv',header=TRUE,stringsAsFactors = F)
test = read.csv('data/test.csv',header=TRUE,stringsAsFactors = F)
train = train[,-1]
test = test[,-1]
y = train[,ncol(train)]
y = gsub('Class_','',y)
y = as.integer(y)-1 # xgboost takes labels in [0, numOfClass)
x = rbind(train[,-ncol(train)],test)
x = as.matrix(x)
x = matrix(as.numeric(x),nrow(x),ncol(x))
trind = 1:length(y)
teind = (nrow(train)+1):nrow(x)
# Set necessary parameter
param <- list("objective" = "multi:softprob",
"eval_metric" = "mlogloss",
"num_class" = 9,
"nthread" = 8)
# Run cross-validation
cv.nround = 50
bst.cv = xgb.cv(param=param, data = x[trind,], label = y,
nfold = 3, nrounds=cv.nround)
# Train the model
nround = 50
bst = xgboost(param=param, data = x[trind,], label = y, nrounds=nround)
# Make prediction
pred = predict(bst,x[teind,])
pred = matrix(pred,9,length(pred)/9)
pred = t(pred)
# Output submission
pred = format(pred, digits=2,scientific=F) # shrink the size of submission
pred = data.frame(1:nrow(pred),pred)
names(pred) = c('id', paste0('Class_',1:9))
write.csv(pred,file='submission.csv', quote=FALSE,row.names=FALSE)

View File

@ -1,7 +1,5 @@
#! /usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper/')
import xgboost as xgb
# labels need to be in 0 to num_class - 1

View File

@ -1,5 +1,4 @@
#!/usr/bin/python
import sys
fo = open( 'machine.txt', 'w' )
cnt = 6

View File

@ -1,17 +1,10 @@
Distributed XGBoost
======
This folder contains information about Distributed XGBoost (Distributed GBDT).
Distributed XGBoost is now part of [Wormhole](https://github.com/dmlc/wormhole).
Checkout this [Link](https://github.com/dmlc/wormhole/tree/master/learn/xgboost) for usage examples, build and job submissions.
* The distributed version is built on Rabit: [Reliable Allreduce and Broadcast Library](https://github.com/dmlc/rabit)
- Rabit is a portable library that provides fault tolerance for Allreduce calls in distributed machine learning
- This makes xgboost portable and fault-tolerant against node failures
* You can run Distributed XGBoost on platforms including Hadoop (see the [hadoop folder](hadoop)) and MPI
- Rabit only relies on the platform to start the programs, so it should be easy to port xgboost to most platforms (a minimal sketch of the Allreduce pattern follows below)
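Below is a minimal sketch of that Allreduce pattern (an illustration only, assuming just the rabit calls used elsewhere in this codebase: ```rabit::Init```, ```rabit::Allreduce```, ```rabit::GetRank```, ```rabit::Finalize```):

```c++
#include <rabit.h>

int main(int argc, char *argv[]) {
  rabit::Init(argc, argv);
  // each worker accumulates a partial statistic over its local rows,
  // e.g. a gradient sum used to decide a tree split
  float stats[2] = {0.0f, 0.0f};
  // ... fill stats from local data ...
  rabit::Allreduce<rabit::op::Sum>(stats, 2);  // now identical on every node
  if (rabit::GetRank() == 0) {
    // rank 0 may report or checkpoint the aggregated statistics
  }
  rabit::Finalize();
  return 0;
}
```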
Build
=====
* In the root folder, type ```make```
- If you have a C++11 compiler, it is recommended to use ```make cxx11=1```
Notes
====
@ -27,11 +20,9 @@ Notes
Solvers
=====
There are two solvers in distributed xgboost. You can check out local demos of the two solvers; see [row-split](row-split) and [col-split](col-split)
* The column-based solver splits data by column; each node works on a subset of columns,
using exactly the same algorithm as the single node version.
* The row-based solver splits data by row; each node works on a subset of rows,
using an approximate histogram count algorithm that only examines a subset of
potential split points, as opposed to all split points.
- This is the mode used by the current hadoop version, since data is usually stored by rows in many industry systems

View File

@ -1,40 +0,0 @@
Distributed XGBoost: Hadoop Yarn Version
====
* The scripts in this folder show an example of how to run distributed xgboost on a hadoop platform with YARN
* It relies on the [Rabit Library](https://github.com/dmlc/rabit) (Reliable Allreduce and Broadcast Interface) and YARN. Rabit provides an interface to aggregate gradient values and split statistics, which allows xgboost to run reliably on hadoop. You do not need to worry about how to update the model in each iteration; just use the script ```rabit_yarn.py```. For those who want to know how it exactly works, please refer to the main page of [Rabit](https://github.com/dmlc/rabit).
* Quick start: run ```bash run_mushroom.sh <n_hadoop_workers> <n_thread_per_worker> <path_in_HDFS>```
- This is the hadoop version of the binary classification example in the demo folder.
- More information on the usage of xgboost can be found on the [wiki page](https://github.com/dmlc/xgboost/wiki)
Before you run the script
====
* Make sure you have set up the hadoop environment.
- Check variable $HADOOP_PREFIX exists (e.g. run ```echo $HADOOP_PREFIX```)
- Compile xgboost with hdfs support by typing ```make hdfs=1```
How to Use
====
* Input data format: LIBSVM format. The example here uses generated data in the demo/data folder.
* Put the training data in HDFS (the hadoop distributed file system).
* Use the rabit script ```rabit_yarn.py``` to submit the training task to YARN
* Get the final model file from HDFS, then do prediction and visualization of the model locally.
Single machine vs Hadoop version
====
If you have used xgboost (the single machine version) before, this section will show you how to run xgboost on hadoop with only slight modifications to the conf file.
* IO: instead of reading and writing files locally, we now use HDFS; put the ```hdfs://``` prefix on the address of any file you would like to access
* File cache: ```rabit_yarn.py``` also provides several ways to cache necessary files, including the binary file (xgboost) and the conf file
- ```rabit_yarn.py``` will automatically cache files in the command line. For example, ```rabit_yarn.py -n 3 $localPath/xgboost mushroom.hadoop.conf``` will cache "xgboost" and "mushroom.hadoop.conf".
- You can also use "-f" to manually cache one or more files, like ```-f file1 -f file2``` or ```-f file1#file2``` (use "#" to split file names).
- The local path of cached files in the command is "./".
- Since the cached files will be packaged and delivered to hadoop slave nodes, the cached files should not be large.
* The hadoop version also supports evaluating each training round. You just need to set the parameter "eval_train".
* More details of job submission can be found in the usage of ```rabit_yarn.py```.
* The model saved by the hadoop version is compatible with the single machine version.
Notes
====
* The code has been tested on YARN.
* The code is optimized with multi-threading, so you will want to run one xgboost per node/worker for best performance.
- You will want to set <n_thread_per_worker> to the number of cores you have on each machine.
* It is also possible to submit jobs with hadoop streaming; however, YARN is highly recommended for efficiency reasons

View File

@ -1,36 +0,0 @@
# General Parameters, see comment for each definition
# choose the booster, can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic
# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3
# Task Parameters
# the number of round to do boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
# evaluate on training data as well each round
# eval_train = 1
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
# eval[test] = "agaricus.txt.test"
# Please do not modify the following parameters
# The path of training data, with prefix hdfs
#data = hdfs:/data/
# The path of model file
#model_out =
# split pattern of xgboost
dsplit = row
# evaluate on training data as well each round
eval_train = 1

View File

@ -1,28 +0,0 @@
#!/bin/bash
if [ "$#" -lt 3 ];
then
echo "Usage: <nworkers> <nthreads> <path_in_HDFS>"
exit -1
fi
# put the local training file to HDFS
hadoop fs -mkdir $3/data
hadoop fs -put ../../demo/data/agaricus.txt.train $3/data
hadoop fs -put ../../demo/data/agaricus.txt.test $3/data
# running rabit, pass address in hdfs
../../subtree/rabit/tracker/rabit_yarn.py -n $1 --vcores $2 ../../xgboost mushroom.hadoop.conf nthread=$2\
data=hdfs://$3/data/agaricus.txt.train\
eval[test]=hdfs://$3/data/agaricus.txt.test\
model_out=hdfs://$3/mushroom.final.model
# get the final model file
hadoop fs -get $3/mushroom.final.model final.model
# output prediction task=pred
../../xgboost mushroom.hadoop.conf task=pred model_in=final.model test:data=../../demo/data/agaricus.txt.test
# print the boosters of final.model in dump.raw.txt
../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
# use the feature map in printing for better visualization
../../xgboost mushroom.hadoop.conf task=dump model_in=final.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
cat dump.nice.txt

View File

@ -1,18 +0,0 @@
Distributed XGBoost: Row Split Version
====
* You might be interested in checking out the [Hadoop example](../hadoop)
* Machine Rabit: run ```bash machine-row-rabit.sh <n-mpi-process>```
- machine-row-rabit.sh starts an xgboost job using rabit
How to Use
====
* First split the data by rows
* In the config, specify the data file as containing a wildcard %d, where %d is the rank of the node; each node will load its part of the data
* Enable row split mode by ```dsplit=row```
Notes
====
* The code is multi-threaded, so you want to run one xgboost-mpi per node
* The row-based solver splits data by row; each node works on a subset of rows, using an approximate histogram count algorithm
that only examines a subset of potential split points, as opposed to all split points.

View File

@ -1,20 +0,0 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
rm -rf train-machine.row* *.model
k=$1
# make machine data
cd ../../demo/regression/
python mapfeat.py
python mknfold.py machine.txt 1
cd -
# split the lib svm file into k subfiles
python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
# run xgboost mpi
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost.mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0 mock=0,0,3,0 mock=2,2,3,0

View File

@ -1,24 +0,0 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
rm -rf train-machine.row* *.model
k=$1
# make machine data
cd ../../demo/regression/
python mapfeat.py
python mknfold.py machine.txt 1
cd -
# split the lib svm file into k subfiles
python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
# run xgboost mpi
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=3 eval_train=1
# run xgboost-mpi save model 0001, continue to run from existing model
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=1
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=2 model_in=0001.model

View File

@ -1,30 +0,0 @@
# General Parameters, see comment for each definition
# choose the tree booster, can also change to gblinear
booster = gbtree
# this is the only difference from classification: use reg:linear to do linear regression
# when labels are in [0,1] we can also use reg:logistic
objective = reg:linear
# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3
# Task parameters
# the number of round to do boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
use_buffer = 0
# The path of training data
data = "train-machine.row%d"
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
eval[test] = "../../demo/regression/machine.txt.test"
# The path of test data
test:data = "../../demo/regression/machine.txt.test"

View File

@ -1,24 +0,0 @@
#!/usr/bin/python
import sys
import random
# split libsvm file into different rows
if len(sys.argv) < 4:
print ('Usage:<fin> <fo> k')
exit(0)
random.seed(10)
k = int(sys.argv[3])
fi = open( sys.argv[1], 'r' )
fos = []
for i in range(k):
fos.append(open( sys.argv[2]+'.row%d' % i, 'w' ))
for l in open(sys.argv[1]):
i = random.randint(0, k-1)
fos[i].write(l)
for f in fos:
f.close()

View File

@ -206,6 +206,10 @@ class GBTree : public IGradBooster {
for (size_t i = 0; i < trees.size(); ++i) {
delete trees[i];
}
for (size_t i = 0; i < updaters.size(); ++i) {
delete updaters[i];
}
updaters.clear();
trees.clear();
pred_buffer.clear();
pred_counter.clear();
@ -444,12 +448,12 @@ class GBTree : public IGradBooster {
int reserved[31];
/*! \brief constructor */
ModelParam(void) {
std::memset(this, 0, sizeof(ModelParam));
num_trees = 0;
num_roots = num_feature = 0;
num_pbuffer = 0;
num_output_group = 1;
size_leaf_vector = 0;
std::memset(reserved, 0, sizeof(reserved));
}
/*!
* \brief set parameters from outside

src/io/dmlc_simple.cpp (new file, 127 lines)
View File

@ -0,0 +1,127 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#define NOMINMAX
#include "../utils/io.h"
// implements a single-file, no-split version of the dmlc I/O interfaces,
// in case we want to avoid a dependency on dmlc-core
namespace xgboost {
namespace utils {
class SingleFileSplit : public dmlc::InputSplit {
public:
explicit SingleFileSplit(const char *fname)
: use_stdin_(false) {
if (!std::strcmp(fname, "stdin")) {
#ifndef XGBOOST_STRICT_CXX98_
use_stdin_ = true; fp_ = stdin;
#endif
}
if (!use_stdin_) {
fp_ = utils::FopenCheck(fname, "r");
}
end_of_file_ = false;
}
virtual ~SingleFileSplit(void) {
if (!use_stdin_) std::fclose(fp_);
}
virtual bool ReadRecord(std::string *out_data) {
if (end_of_file_) return false;
out_data->clear();
while (true) {
char c = std::fgetc(fp_);
if (c == EOF) {
end_of_file_ = true;
}
if (c != '\r' && c != '\n' && c != EOF) {
*out_data += c;
} else {
if (out_data->length() != 0) return true;
if (end_of_file_) return false;
}
}
return false;
}
private:
std::FILE *fp_;
bool use_stdin_;
bool end_of_file_;
};
class StdFile : public dmlc::Stream {
public:
explicit StdFile(const char *fname, const char *mode)
: use_stdio(false) {
using namespace std;
#ifndef XGBOOST_STRICT_CXX98_
if (!strcmp(fname, "stdin")) {
use_stdio = true; fp = stdin;
}
if (!strcmp(fname, "stdout")) {
use_stdio = true; fp = stdout;
}
#endif
if (!strncmp(fname, "file://", 7)) fname += 7;
if (!use_stdio) {
std::string flag = mode;
if (flag == "w") flag = "wb";
if (flag == "r") flag = "rb";
fp = utils::FopenCheck(fname, flag.c_str());
}
}
virtual ~StdFile(void) {
this->Close();
}
virtual size_t Read(void *ptr, size_t size) {
return std::fread(ptr, 1, size, fp);
}
virtual void Write(const void *ptr, size_t size) {
std::fwrite(ptr, size, 1, fp);
}
virtual void Seek(size_t pos) {
std::fseek(fp, static_cast<long>(pos), SEEK_SET);
}
virtual size_t Tell(void) {
return std::ftell(fp);
}
virtual bool AtEnd(void) const {
return std::feof(fp) != 0;
}
inline void Close(void) {
if (fp != NULL && !use_stdio) {
std::fclose(fp); fp = NULL;
}
}
private:
std::FILE *fp;
bool use_stdio;
};
} // namespace utils
} // namespace xgboost
namespace dmlc {
InputSplit* InputSplit::Create(const char *uri,
unsigned part,
unsigned nsplit) {
using namespace xgboost;
const char *msg = "xgboost is compiled in local mode\n"\
"to use hdfs, s3 or distributed version, compile with make dmlc=1";
utils::Check(strncmp(uri, "s3://", 5) != 0, msg);
utils::Check(strncmp(uri, "hdfs://", 7) != 0, msg);
utils::Check(nsplit == 1, msg);
return new utils::SingleFileSplit(uri);
}
Stream *Stream::Create(const char *uri, const char * const flag) {
using namespace xgboost;
const char *msg = "xgboost is compiled in local mode\n"\
"to use hdfs, s3 or distributed version, compile with make dmlc=1";
utils::Check(strncmp(uri, "s3://", 5) != 0, msg);
utils::Check(strncmp(uri, "hdfs://", 7) != 0, msg);
return new utils::StdFile(uri, flag);
}
} // namespace dmlc
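As a rough usage sketch (not part of this commit; the file names below are made up), the factory functions above route plain paths to the local implementations and reject distributed URIs:

```c++
#include <string>
#include "dmlc/io.h"  // assumed include path for the dmlc interface header

int main() {
  // resolves to utils::SingleFileSplit, since this build has no dmlc-core
  dmlc::InputSplit *split = dmlc::InputSplit::Create("train.libsvm", 0, 1);
  std::string line;
  while (split->ReadRecord(&line)) {
    // each record is one line of the input, with the newline stripped
  }
  delete split;
  // an "hdfs://..." or "s3://..." uri here would fail the utils::Check
  // with the "compile with make dmlc=1" message
  dmlc::Stream *fo = dmlc::Stream::Create("model.bin", "w");
  int version = 1;
  fo->Write(&version, sizeof(version));
  delete fo;  // StdFile closes the underlying FILE* in its destructor
  return 0;
}
```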

View File

@ -16,7 +16,10 @@ namespace xgboost {
namespace io {
DataMatrix* LoadDataMatrix(const char *fname, bool silent,
bool savebuffer, bool loadsplit) {
if (!std::strcmp(fname, "stdin") || loadsplit) {
if (!std::strcmp(fname, "stdin") ||
!std::strncmp(fname, "s3://", 5) ||
!std::strncmp(fname, "hdfs://", 7) ||
loadsplit) {
DMatrixSimple *dmat = new DMatrixSimple();
dmat->LoadText(fname, silent, loadsplit);
return dmat;

View File

@ -90,11 +90,11 @@ class DMatrixSimple : public DataMatrix {
rank = rabit::GetRank();
npart = rabit::GetWorldSize();
}
rabit::io::InputSplit *in =
rabit::io::CreateInputSplit(uri, rank, npart);
dmlc::InputSplit *in =
dmlc::InputSplit::Create(uri, rank, npart);
this->Clear();
std::string line;
while (in->NextLine(&line)) {
while (in->ReadRecord(&line)) {
float label;
std::istringstream ss(line);
std::vector<RowBatch::Entry> feats;

View File

@ -192,8 +192,10 @@ class FMatrixS : public IFMatrix{
bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ncol; ++i) {
std::sort(&col_data_[0] + col_ptr_[i],
&col_data_[0] + col_ptr_[i + 1], Entry::CmpValue);
if (col_ptr_[i] < col_ptr_[i + 1]) {
std::sort(BeginPtr(col_data_) + col_ptr_[i],
BeginPtr(col_data_) + col_ptr_[i + 1], Entry::CmpValue);
}
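// the emptiness check (together with BeginPtr) avoids forming &col_data_[0]
// for an empty vector, which is undefined behavior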
}
}

View File

@ -83,7 +83,15 @@ struct EvalLogLoss : public EvalEWiseBase<EvalLogLoss> {
return "logloss";
}
inline static float EvalRow(float y, float py) {
return - y * std::log(py) - (1.0f - y) * std::log(1 - py);
const float eps = 1e-16f;
const float pneg = 1.0f - py;
if (py < eps) {
return -y * std::log(eps) - (1.0f - y) * std::log(1.0f - eps);
} else if (pneg < eps) {
return -y * std::log(1.0f - eps) - (1.0f - y) * std::log(eps);
} else {
return -y * std::log(py) - (1.0f - y) * std::log(pneg);
}
}
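// equivalently: the prediction is clipped into [eps, 1 - eps] before the
// logs are taken, so a degenerate prediction of exactly 0 or 1 can no
// longer produce log(0)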
};
@ -111,17 +119,29 @@ struct EvalMClassBase : public IEvaluator {
utils::Check(preds.size() % info.labels.size() == 0,
"label and prediction size not match");
const size_t nclass = preds.size() / info.labels.size();
utils::Check(nclass > 1,
"mlogloss and merror are only used for multi-class classification,"\
" use logloss for binary classification");
const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
float sum = 0.0, wsum = 0.0;
int label_error = 0;
#pragma omp parallel for reduction(+: sum, wsum) schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
const float wt = info.GetWeight(i);
int label = static_cast<int>(info.labels[i]);
if (label >= 0 && label < static_cast<int>(nclass)) {
sum += Derived::EvalRow(info.labels[i],
BeginPtr(preds) + i * nclass,
nclass) * wt;
wsum += wt;
} else {
label_error = label;
}
}
utils::Check(label_error >= 0 && label_error < static_cast<int>(nclass),
"MultiClassEvaluation: label must be in [0, num_class)," \
" num_class=%d but found %d in label",
static_cast<int>(nclass), label_error);
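// label validation is deferred to this single Check: failing inside the
// omp parallel loop above would be unsafe, so offending labels are only
// recorded there and reported here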
float dat[2]; dat[0] = sum, dat[1] = wsum;
if (distributed) {
rabit::Allreduce<rabit::op::Sum>(dat, 2);
@ -135,7 +155,7 @@ struct EvalMClassBase : public IEvaluator {
* \param pred prediction value of current instance
* \param nclass number of class in the prediction
*/
inline static float EvalRow(float label,
inline static float EvalRow(int label,
const float *pred,
size_t nclass);
/*!
@ -146,13 +166,15 @@ struct EvalMClassBase : public IEvaluator {
inline static float GetFinal(float esum, float wsum) {
return esum / wsum;
}
// used to store error message
const char *error_msg_;
};
/*! \brief match error */
struct EvalMatchError : public EvalMClassBase<EvalMatchError> {
virtual const char *Name(void) const {
return "merror";
}
inline static float EvalRow(float label,
inline static float EvalRow(int label,
const float *pred,
size_t nclass) {
return FindMaxIndex(pred, nclass) != static_cast<int>(label);
@ -163,12 +185,11 @@ struct EvalMultiLogLoss : public EvalMClassBase<EvalMultiLogLoss> {
virtual const char *Name(void) const {
return "mlogloss";
}
inline static float EvalRow(float label,
inline static float EvalRow(int label,
const float *pred,
size_t nclass) {
const float eps = 1e-16f;
size_t k = static_cast<size_t>(label);
utils::Check(k < nclass, "mlogloss: label must be in [0, num_class)");
if (pred[k] > eps) {
return -std::log(pred[k]);
} else {

View File

@ -43,6 +43,26 @@ inline static int FindMaxIndex(const std::vector<float>& rec) {
return FindMaxIndex(BeginPtr(rec), rec.size());
}
// perform numerical safe logsum
inline float LogSum(float x, float y) {
if (x < y) {
return y + std::log(std::exp(x - y) + 1.0f);
} else {
return x + std::log(std::exp(y - x) + 1.0f);
}
}
// numerical safe logsum
inline float LogSum(const float *rec, size_t size) {
float mx = rec[0];
for (size_t i = 1; i < size; ++i) {
mx = std::max(mx, rec[i]);
}
float sum = 0.0f;
for (size_t i = 0; i < size; ++i) {
sum += std::exp(rec[i] - mx);
}
return mx + std::log(sum);
}
inline static bool CmpFirst(const std::pair<float, unsigned> &a,
const std::pair<float, unsigned> &b) {

View File

@ -23,7 +23,7 @@ namespace learner {
* \brief learner that takes do gradient boosting on specific objective functions
* and do training and prediction
*/
class BoostLearner : public rabit::ISerializable {
class BoostLearner : public rabit::Serializable {
public:
BoostLearner(void) {
obj_ = NULL;
@ -163,34 +163,51 @@ class BoostLearner : public rabit::ISerializable {
bool calc_num_feature = true) {
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
"BoostLearner: wrong model format");
utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
{
// backward-compatibility code to stay compatible with the old model format
// for new models, Read(&name_obj_) suffices
size_t len;
utils::Check(fi.Read(&len, sizeof(len)) != 0, "BoostLearner: wrong model format");
if (len >= std::numeric_limits<unsigned>::max()) {
int gap;
utils::Check(fi.Read(&gap, sizeof(gap)) != 0, "BoostLearner: wrong model format");
len = len >> 32UL;
}
if (len != 0) {
name_obj_.resize(len);
utils::Check(fi.Read(&name_obj_[0], len) != 0, "BoostLearner: wrong model format");
}
}
utils::Check(fi.Read(&name_gbm_), "BoostLearner: wrong model format");
// delete existing gbm if any
if (obj_ != NULL) delete obj_;
if (gbm_ != NULL) delete gbm_;
this->InitTrainer(calc_num_feature);
this->InitObjGBM();
char tmp[32];
utils::SPrintf(tmp, sizeof(tmp), "%u", mparam.num_class);
obj_->SetParam("num_class", tmp);
gbm_->LoadModel(fi, with_pbuffer);
if (!with_pbuffer || distributed_mode == 2) {
gbm_->ResetPredBuffer(pred_buffer_size);
}
}
// rabit load model from rabit checkpoint
virtual void Load(rabit::IStream &fi) {
virtual void Load(rabit::Stream *fi) {
// for row split, we should not keep pbuffer
this->LoadModel(fi, distributed_mode != 2, false);
this->LoadModel(*fi, distributed_mode != 2, false);
}
// rabit save model to rabit checkpoint
virtual void Save(rabit::IStream &fo) const {
virtual void Save(rabit::Stream *fo) const {
// for row split, we should not keep pbuffer
this->SaveModel(fo, distributed_mode != 2);
this->SaveModel(*fo, distributed_mode != 2);
}
/*!
* \brief load model from file
* \param fname file name
*/
inline void LoadModel(const char *fname) {
utils::IStream *fi = rabit::io::CreateStream(fname, "r");
utils::IStream *fi = utils::IStream::Create(fname, "r");
std::string header; header.resize(4);
// check header for different binary encode
// can be base64 or binary
@ -204,7 +221,7 @@ class BoostLearner : public rabit::ISerializable {
this->LoadModel(*fi);
} else {
delete fi;
fi = rabit::io::CreateStream(fname, "r");
fi = utils::IStream::Create(fname, "r");
this->LoadModel(*fi);
}
delete fi;
@ -221,7 +238,7 @@ class BoostLearner : public rabit::ISerializable {
* \param save_base64 whether save in base64 format
*/
inline void SaveModel(const char *fname, bool save_base64 = false) const {
utils::IStream *fo = rabit::io::CreateStream(fname, "w");
utils::IStream *fo = utils::IStream::Create(fname, "w");
if (save_base64 != 0 || !strcmp(fname, "stdout")) {
fo->Write("bs64\t", 5);
utils::Base64OutStream bout(fo);

View File

@ -82,11 +82,13 @@ struct LossType {
* \return second order gradient
*/
inline float SecondOrderGradient(float predt, float label) const {
// cap the second order gradient to a positive value
const float eps = 1e-16f;
switch (loss_type) {
case kLinearSquare: return 1.0f;
case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt));
case kLogisticClassify:
case kLogisticNeglik: return predt * (1 - predt);
case kLogisticNeglik: return std::max(predt * (1.0f - predt), eps);
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
@ -195,6 +197,7 @@ class SoftmaxMultiClassObj : public IObjFunction {
gpair.resize(preds.size());
const unsigned nstep = static_cast<unsigned>(info.labels.size() * nclass);
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size() / nclass);
int label_error = 0;
#pragma omp parallel
{
std::vector<float> rec(nclass);
@ -206,8 +209,9 @@ class SoftmaxMultiClassObj : public IObjFunction {
Softmax(&rec);
const unsigned j = i % nstep;
int label = static_cast<int>(info.labels[j]);
utils::Check(label >= 0 && label < nclass,
"SoftmaxMultiClassObj: label must be in [0, num_class)");
if (label < 0 || label >= nclass) {
label_error = label; label = 0;
}
const float wt = info.GetWeight(j);
for (int k = 0; k < nclass; ++k) {
float p = rec[k];
@ -220,6 +224,9 @@ class SoftmaxMultiClassObj : public IObjFunction {
}
}
}
utils::Check(label_error >= 0 && label_error < nclass,
"SoftmaxMultiClassObj: label must be in [0, num_class),"\
" num_class=%d but found %d in label", nclass, label_error);
}
virtual void PredTransform(std::vector<float> *io_preds) {
this->Transform(io_preds, output_prob);

View File

@ -7,7 +7,6 @@
* \author Tianqi Chen
*/
#include "../../subtree/rabit/include/rabit.h"
#include "../../subtree/rabit/rabit-learn/io/io.h"
#endif // XGBOOST_SYNC_H_

View File

@ -28,6 +28,10 @@ struct TrainParam{
float reg_alpha;
// default direction choice
int default_direction;
// maximum delta update we can add in weight estimation
// this parameter can be used to stablize update
// default=0 means no constraint on weight delta
float max_delta_step;
// whether we want to do subsample
float subsample;
// whether to subsample columns each split, in each level
@ -52,6 +56,7 @@ struct TrainParam{
learning_rate = 0.3f;
min_split_loss = 0.0f;
min_child_weight = 1.0f;
max_delta_step = 0.0f;
max_depth = 6;
reg_lambda = 1.0f;
reg_alpha = 0.0f;
@ -81,6 +86,7 @@ struct TrainParam{
if (!strcmp(name, "learning_rate")) learning_rate = static_cast<float>(atof(val));
if (!strcmp(name, "min_child_weight")) min_child_weight = static_cast<float>(atof(val));
if (!strcmp(name, "min_split_loss")) min_split_loss = static_cast<float>(atof(val));
if (!strcmp(name, "max_delta_step")) max_delta_step = static_cast<float>(atof(val));
if (!strcmp(name, "reg_lambda")) reg_lambda = static_cast<float>(atof(val));
if (!strcmp(name, "reg_alpha")) reg_alpha = static_cast<float>(atof(val));
if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
@ -102,11 +108,21 @@ struct TrainParam{
// calculate the cost of loss function
inline double CalcGain(double sum_grad, double sum_hess) const {
if (sum_hess < min_child_weight) return 0.0;
if (max_delta_step == 0.0f) {
if (reg_alpha == 0.0f) {
return Sqr(sum_grad) / (sum_hess + reg_lambda);
} else {
return Sqr(ThresholdL1(sum_grad, reg_alpha)) / (sum_hess + reg_lambda);
}
} else {
double w = CalcWeight(sum_grad, sum_hess);
double ret = sum_grad * w + 0.5 * (sum_hess + reg_lambda) * Sqr(w);
if (reg_alpha == 0.0f) {
return - 2.0 * ret;
} else {
return - 2.0 * (ret + reg_alpha * std::abs(w));
}
}
}
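// closed forms of the branches above: without the cap, w* = -G / (H + lambda)
// and gain = G^2 / (H + lambda), with G soft-thresholded by reg_alpha when it
// is nonzero; when max_delta_step clips w, the gain is read off the objective
// as -2 * (G*w + 0.5*(H + lambda)*w^2 + reg_alpha*|w|), which reduces to the
// same value whenever the clip is inactive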
// calculate the cost of the loss function with four statistics
inline double CalcGain(double sum_grad, double sum_hess,
@ -122,11 +138,17 @@ struct TrainParam{
// calculate weight given the statistics
inline double CalcWeight(double sum_grad, double sum_hess) const {
if (sum_hess < min_child_weight) return 0.0;
double dw;
if (reg_alpha == 0.0f) {
return -sum_grad / (sum_hess + reg_lambda);
dw = -sum_grad / (sum_hess + reg_lambda);
} else {
return -ThresholdL1(sum_grad, reg_alpha) / (sum_hess + reg_lambda);
dw = -ThresholdL1(sum_grad, reg_alpha) / (sum_hess + reg_lambda);
}
if (max_delta_step != 0.0f) {
if (dw > max_delta_step) dw = max_delta_step;
if (dw < -max_delta_step) dw = -max_delta_step;
}
return dw;
}
/*! \brief whether need forward small to big search: default right */
inline bool need_forward_search(float col_density = 0.0f) const {

View File

@ -406,7 +406,8 @@ class ColMaker: public IUpdater {
c.SetSubstract(snode[nid].stats, e.stats);
if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
const float delta = d_step == +1 ? rt_eps : -rt_eps;
const float gap = std::abs(e.last_fvalue) + rt_eps;
const float delta = d_step == +1 ? gap: -gap;
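// the gap scales with |last_fvalue| rather than a fixed rt_eps: for large
// feature values, last_fvalue + rt_eps can round back to last_fvalue in
// float arithmetic, yielding a split threshold that separates nothing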
e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
}
}
@ -497,6 +498,9 @@ class ColMaker: public IUpdater {
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
const bst_uint ridx = rowset[i];
if (ridx >= position.size()) {
utils::Printf("ridx exceed bound\n");
}
const int nid = this->DecodePosition(ridx);
if (tree[nid].is_leaf()) {
// mark finish when it is not a fresh leaf

View File

@ -1,5 +1,5 @@
#ifndef RABIT_LEARN_IO_BASE64_INL_H_
#define RABIT_LEARN_IO_BASE64_INL_H_
#ifndef XGBOOST_UTILS_BASE64_INL_H_
#define XGBOOST_UTILS_BASE64_INL_H_
/*!
* \file base64.h
* \brief data stream support to input and output from/to base64 stream
@ -9,10 +9,54 @@
#include <cctype>
#include <cstdio>
#include "./io.h"
#include "./buffer_reader-inl.h"
namespace rabit {
namespace io {
namespace xgboost {
namespace utils {
/*! \brief buffer reader of the stream that allows you to get characters one at a time */
class StreamBufferReader {
public:
StreamBufferReader(size_t buffer_size)
:stream_(NULL),
read_len_(1), read_ptr_(1) {
buffer_.resize(buffer_size);
}
/*!
* \brief set input stream
*/
inline void set_stream(IStream *stream) {
stream_ = stream;
read_len_ = read_ptr_ = 1;
}
/*!
* \brief allows quick reads using GetChar
*/
inline char GetChar(void) {
while (true) {
if (read_ptr_ < read_len_) {
return buffer_[read_ptr_++];
} else {
read_len_ = stream_->Read(&buffer_[0], buffer_.length());
if (read_len_ == 0) return EOF;
read_ptr_ = 0;
}
}
}
/*! \brief whether we are reaching the end of file */
inline bool AtEnd(void) const {
return read_len_ == 0;
}
private:
/*! \brief the underlying stream */
IStream *stream_;
/*! \brief buffer to hold data */
std::string buffer_;
/*! \brief length of valid data in buffer */
size_t read_len_;
/*! \brief pointer in the buffer */
size_t read_ptr_;
};
/*! \brief namespace of base64 decoding and encoding table */
namespace base64 {
const char DecodeTable[] = {
@ -209,9 +253,11 @@ class Base64OutStream: public IStream {
if (out_buf.length() >= kBufferSize) Flush();
}
inline void Flush(void) {
fp->Write(BeginPtr(out_buf), out_buf.length());
if (out_buf.length() != 0) {
fp->Write(&out_buf[0], out_buf.length());
out_buf.clear();
}
}
};
} // namespace utils
} // namespace rabit

View File

@ -14,12 +14,10 @@
namespace xgboost {
namespace utils {
// reuse the definitions of streams
typedef rabit::IStream IStream;
typedef rabit::utils::ISeekStream ISeekStream;
typedef rabit::Stream IStream;
typedef rabit::utils::SeekStream ISeekStream;
typedef rabit::utils::MemoryFixSizeBuffer MemoryFixSizeBuffer;
typedef rabit::utils::MemoryBufferStream MemoryBufferStream;
typedef rabit::io::Base64InStream Base64InStream;
typedef rabit::io::Base64OutStream Base64OutStream;
/*! \brief implementation of file i/o stream */
class FileStream : public ISeekStream {
@ -54,4 +52,6 @@ class FileStream : public ISeekStream {
};
} // namespace utils
} // namespace xgboost
#include "./base64-inl.h"
#endif

View File

@ -296,7 +296,17 @@ struct WXQSummary : public WQSummary<DType, RType> {
}
RType begin = src.data[0].rmax;
size_t n = maxsize - 1, nbig = 0;
const RType range = src.data[src.size - 1].rmin - begin;
RType range = src.data[src.size - 1].rmin - begin;
// prune off zero weights
if (range == 0.0f) {
// special case: contains only two effective data points
this->data[0] = src.data[0];
this->data[1] = src.data[src.size - 1];
this->size = 2;
return;
} else {
range = std::max(range, static_cast<RType>(1e-3f));
}
const RType chunk = 2 * range / n;
// minimized range
RType mrange = 0;
@ -316,7 +326,19 @@ struct WXQSummary : public WQSummary<DType, RType> {
mrange += src.data[src.size-1].rmax_prev() - src.data[bid].rmin_next();
}
}
utils::Assert(nbig < n - 1, "too many large chunk");
if (nbig >= n - 1) {
// see what was the case
fprintf(stderr, "LOG: check quantile stats, nbig=%lu, n=%lu\n", nbig, n);
fprintf(stderr, "LOG: srcsize=%lu, maxsize=%lu, range=%g, chunk=%g\n",
src.size, maxsize, static_cast<double>(range),
static_cast<double>(chunk));
for (size_t i = 0; i < src.size; ++i) {
printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g, isbig=%d\n", i,
src.data[i].rmin, src.data[i].rmax, src.data[i].wmin,
src.data[i].value, CheckLarge(src.data[i], chunk));
}
utils::Assert(nbig < n - 1, "quantile: too many large chunk");
}
this->data[0] = src.data[0];
this->size = 1;
// use smaller size
@ -619,6 +641,7 @@ class QuantileSketchTemplate {
* \param x the elemented added to the sketch
*/
inline void Push(DType x, RType w = 1) {
if (w == static_cast<RType>(0)) return;
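// a zero-weight point contributes nothing to the weighted quantile summary,
// so it is dropped before entering the queue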
if (inqueue.qtail == inqueue.queue.size()) {
// jump from lazy one value to limit_size * 2
if (inqueue.queue.size() == 1) {

View File

@ -5,7 +5,8 @@ rabit is a light weight library that provides a fault tolerant interface of Allr
* [Tutorial](guide)
* [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc)
* You can also directly read the [interface header](include/rabit.h)
* [Machine Learning Tools](rabit-learn)
* [Distributed Machine Learning Tools](https://github.com/dmlc/wormhole)
- Rabit is one of the backbone libraries supporting the wormhole machine learning tools
Features
====
@ -33,5 +34,4 @@ Contributing
Rabit is an open-source library, contributions are welcomed, including:
* The rabit core library.
* Customized tracker script for new platforms and interface of new languages.
* Toolkits, benchmarks, resource (links to related repos).
* Tutorial and examples about the library.

View File

@ -95,7 +95,7 @@ WARN_LOGFILE =
#---------------------------------------------------------------------------
# configuration options related to the input files
#---------------------------------------------------------------------------
INPUT =
INPUT = . dmlc
INPUT_ENCODING = UTF-8
FILE_PATTERNS =
RECURSIVE = NO

View File

This section tries to give examples of different aspects of the rabit API.
#### Structure of a Rabit Program
The following code illustrates the common structure of a rabit program. This is an abstract example;
you can also refer to [kmeans.cc](../rabit-learn/kmeans/kmeans.cc) for an example implementation of the kmeans algorithm.
you can also refer to [wormhole](https://github.com/dmlc/wormhole/blob/master/learn/kmeans/kmeans.cc) for an example implementation of the kmeans algorithm.
```c++
#include <rabit.h>

View File

@ -0,0 +1,4 @@
This folder is part of the dmlc-core library; it allows rabit to use a unified stream interface with other dmlc projects.
- Since it is only an interface dependency, dmlc-core is not required to compile rabit.
- To compile a project that uses dmlc-core functions, linking to libdmlc.a (provided by dmlc-core) is required.

View File

@ -0,0 +1,333 @@
/*!
* Copyright (c) 2015 by Contributors
* \file io.h
* \brief defines serializable interface of dmlc
*/
#ifndef DMLC_IO_H_
#define DMLC_IO_H_
#include <cstdio>
#include <string>
#include <vector>
#include <istream>
#include <ostream>
#include <streambuf>
#include <cassert>
/*! \brief namespace for dmlc */
namespace dmlc {
/*!
* \brief interface of stream I/O for serialization
*/
class Stream {
public:
/*!
* \brief reads data from a stream
* \param ptr pointer to a memory buffer
* \param size block size
* \return the size of data read
*/
virtual size_t Read(void *ptr, size_t size) = 0;
/*!
* \brief writes data to a stream
* \param ptr pointer to a memory buffer
* \param size block size
*/
virtual void Write(const void *ptr, size_t size) = 0;
/*! \brief virtual destructor */
virtual ~Stream(void) {}
/*!
* \brief generic factory function
* creates a stream; the stream will close the underlying files
* upon deletion
* \param uri the uri of the input currently we support
* hdfs://, s3://, and file://; by default file:// is used
* \param flag can be "w", "r", "a"
*/
static Stream *Create(const char *uri, const char* const flag);
// helper functions to write/read different data structures
/*!
* \brief writes a vector
* \param vec vector to be written/serialized
*/
template<typename T>
inline void Write(const std::vector<T> &vec);
/*!
* \brief loads a vector
* \param out_vec vector to be loaded/deserialized
* \return whether the load was successful
*/
template<typename T>
inline bool Read(std::vector<T> *out_vec);
/*!
* \brief writes a string
* \param str the string to be written/serialized
*/
inline void Write(const std::string &str);
/*!
* \brief loads a string
* \param out_str string to be loaded/deserialized
* \return whether the load/deserialization was successful
*/
inline bool Read(std::string *out_str);
};
/*! \brief interface of i/o stream that support seek */
class SeekStream: public Stream {
public:
// virtual destructor
virtual ~SeekStream(void) {}
/*! \brief seek to certain position of the file */
virtual void Seek(size_t pos) = 0;
/*! \brief tell the position of the stream */
virtual size_t Tell(void) = 0;
/*! \return whether we are at end of file */
virtual bool AtEnd(void) const = 0;
};
/*! \brief interface for serializable objects */
class Serializable {
public:
/*!
* \brief load the model from a stream
* \param fi stream where to load the model from
*/
virtual void Load(Stream *fi) = 0;
/*!
* \brief saves the model to a stream
* \param fo stream where to save the model to
*/
virtual void Save(Stream *fo) const = 0;
};
/*!
* \brief input split header, used to create input split on input dataset
* this class can be used to obtain filesystem invariant splits from input files
*/
class InputSplit {
public:
/*!
* \brief read next record, store into out_data
* the data in the returned record depends on the input data format:
* if the input is text data, each line is returned as a record (\n not included)
* if the input is recordio, each record is returned
* \param out_data the string that stores the record data; \n is not included
* \return true if the next record was found, false if all records have been read
*/
virtual bool ReadRecord(std::string *out_data) = 0;
/*! \brief destructor*/
virtual ~InputSplit(void) {}
/*!
* \brief factory function:
* create input split given a uri
* \param uri the uri of the input, can contain hdfs prefix
* \param part_index the part id of current input
* \param num_parts total number of splits
*/
static InputSplit* Create(const char *uri,
unsigned part_index,
unsigned num_parts);
};
/*!
 * \brief a std::ostream class that can wrap Stream objects;
 *  output written to this ostream goes to the underlying Stream
*
* Usage example:
* \code
*
* Stream *fs = Stream::Create("hdfs:///test.txt", "w");
* dmlc::ostream os(fs);
* os << "hello world" << std::endl;
* delete fs;
* \endcode
*/
class ostream : public std::basic_ostream<char> {
public:
/*!
* \brief construct std::ostream type
* \param stream the Stream output to be used
* \param buffer_size internal streambuf size
*/
explicit ostream(Stream *stream,
size_t buffer_size = 1 << 10)
: std::basic_ostream<char>(NULL), buf_(buffer_size) {
this->set_stream(stream);
}
  // explicitly synchronize the buffer
virtual ~ostream() {
buf_.pubsync();
}
/*!
* \brief set internal stream to be stream, reset states
* \param stream new stream as output
*/
inline void set_stream(Stream *stream) {
buf_.set_stream(stream);
this->rdbuf(&buf_);
}
private:
// internal streambuf
class OutBuf : public std::streambuf {
public:
explicit OutBuf(size_t buffer_size)
: stream_(NULL), buffer_(buffer_size) {
assert(buffer_.size() > 0);
}
// set stream to the buffer
inline void set_stream(Stream *stream);
private:
/*! \brief internal stream by StreamBuf */
Stream *stream_;
/*! \brief internal buffer */
std::vector<char> buffer_;
// override sync
inline int_type sync(void);
// override overflow
inline int_type overflow(int c);
};
/*! \brief buffer of the stream */
OutBuf buf_;
};
/*!
 * \brief a std::istream class that can wrap Stream objects;
 *  input read from this istream comes from the underlying Stream
*
* Usage example:
* \code
*
* Stream *fs = Stream::Create("hdfs:///test.txt", "r");
* dmlc::istream is(fs);
* is >> mydata;
* delete fs;
* \endcode
*/
class istream : public std::basic_istream<char> {
public:
/*!
   * \brief construct std::istream type
   * \param stream the Stream input to be used
* \param buffer_size internal buffer size
*/
explicit istream(Stream *stream,
size_t buffer_size = 1 << 10)
: std::basic_istream<char>(NULL), buf_(buffer_size) {
this->set_stream(stream);
}
virtual ~istream() {}
/*!
* \brief set internal stream to be stream, reset states
* \param stream new stream as output
*/
inline void set_stream(Stream *stream) {
buf_.set_stream(stream);
this->rdbuf(&buf_);
}
private:
// internal streambuf
class InBuf : public std::streambuf {
public:
explicit InBuf(size_t buffer_size)
: stream_(NULL), buffer_(buffer_size) {
assert(buffer_.size() > 0);
}
// set stream to the buffer
inline void set_stream(Stream *stream);
private:
/*! \brief internal stream by StreamBuf */
Stream *stream_;
/*! \brief internal buffer */
std::vector<char> buffer_;
// override underflow
inline int_type underflow();
};
/*! \brief input buffer */
InBuf buf_;
};
// implementations of inline functions
template<typename T>
inline void Stream::Write(const std::vector<T> &vec) {
size_t sz = vec.size();
this->Write(&sz, sizeof(sz));
if (sz != 0) {
this->Write(&vec[0], sizeof(T) * sz);
}
}
template<typename T>
inline bool Stream::Read(std::vector<T> *out_vec) {
size_t sz;
if (this->Read(&sz, sizeof(sz)) == 0) return false;
out_vec->resize(sz);
if (sz != 0) {
if (this->Read(&(*out_vec)[0], sizeof(T) * sz) == 0) return false;
}
return true;
}
inline void Stream::Write(const std::string &str) {
size_t sz = str.length();
this->Write(&sz, sizeof(sz));
if (sz != 0) {
this->Write(&str[0], sizeof(char) * sz);
}
}
inline bool Stream::Read(std::string *out_str) {
size_t sz;
if (this->Read(&sz, sizeof(sz)) == 0) return false;
out_str->resize(sz);
if (sz != 0) {
if (this->Read(&(*out_str)[0], sizeof(char) * sz) == 0) {
return false;
}
}
return true;
}
// implementations for ostream
inline void ostream::OutBuf::set_stream(Stream *stream) {
if (stream_ != NULL) this->pubsync();
this->stream_ = stream;
this->setp(&buffer_[0], &buffer_[0] + buffer_.size() - 1);
}
inline int ostream::OutBuf::sync(void) {
if (stream_ == NULL) return -1;
std::ptrdiff_t n = pptr() - pbase();
stream_->Write(pbase(), n);
this->pbump(-n);
return 0;
}
inline int ostream::OutBuf::overflow(int c) {
*(this->pptr()) = c;
std::ptrdiff_t n = pptr() - pbase();
this->pbump(-n);
if (c == EOF) {
stream_->Write(pbase(), n);
} else {
stream_->Write(pbase(), n + 1);
}
return c;
}
// implementations for istream
inline void istream::InBuf::set_stream(Stream *stream) {
stream_ = stream;
this->setg(&buffer_[0], &buffer_[0], &buffer_[0]);
}
inline int istream::InBuf::underflow() {
char *bhead = &buffer_[0];
if (this->gptr() == this->egptr()) {
size_t sz = stream_->Read(bhead, buffer_.size());
this->setg(bhead, bhead, bhead + sz);
}
if (this->gptr() == this->egptr()) {
return traits_type::eof();
} else {
return traits_type::to_int_type(*gptr());
}
}
} // namespace dmlc
#endif // DMLC_IO_H_
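
As a quick illustration of the interface above, here is a minimal sketch (assuming dmlc-core is built and linked, and that `/tmp` is writable) that round-trips a vector and a line of text through a `Stream` and the `ostream`/`istream` adapters:
```c++
#include <dmlc/io.h>
#include <iostream>
#include <string>
#include <vector>

int main() {
  {
    // create a writable stream; file:// is the default scheme
    dmlc::Stream *fo = dmlc::Stream::Create("file:///tmp/demo.bin", "w");
    std::vector<float> weights(10, 1.0f);
    fo->Write(weights);        // size-prefixed vector helper
    {
      dmlc::ostream os(fo);    // std::ostream adapter over the Stream
      os << "hello world\n";
    }                          // adapter flushes to fo when it goes out of scope
    delete fo;                 // closes the underlying file
  }
  {
    // read both values back from the same file
    dmlc::Stream *fi = dmlc::Stream::Create("file:///tmp/demo.bin", "r");
    std::vector<float> weights;
    if (fi->Read(&weights)) {
      std::cout << "read " << weights.size() << " floats" << std::endl;
    }
    dmlc::istream is(fi);
    std::string token;
    is >> token;               // reads "hello"
    delete fi;
  }
  return 0;
}
```
Deleting the `Stream` closes the underlying file, as documented in the factory function above.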

View File

@ -16,7 +16,7 @@
#if __cplusplus >= 201103L
#include <functional>
#endif // C++11
// contains definition of ISerializable
// contains definition of Serializable
#include "./rabit_serializable.h"
// engine definition of rabit, defines internal implementation
// to use rabit interface, there is no need to read engine.h
@ -183,8 +183,8 @@ inline void Allreduce(DType *sendrecvbuf, size_t count,
*
* \sa CheckPoint, VersionNumber
*/
inline int LoadCheckPoint(ISerializable *global_model,
ISerializable *local_model = NULL);
inline int LoadCheckPoint(Serializable *global_model,
Serializable *local_model = NULL);
/*!
* \brief checkpoints the model, meaning a stage of execution has finished.
* every time we call check point, a version number will be increased by one
@ -199,8 +199,8 @@ inline int LoadCheckPoint(ISerializable *global_model,
* So, only CheckPoint with the global_model if possible
* \sa LoadCheckPoint, VersionNumber
*/
inline void CheckPoint(const ISerializable *global_model,
const ISerializable *local_model = NULL);
inline void CheckPoint(const Serializable *global_model,
const Serializable *local_model = NULL);
/*!
* \brief This function can be used to replace CheckPoint for global_model only,
* when certain condition is met (see detailed explanation).
@ -222,7 +222,7 @@ inline void CheckPoint(const ISerializable *global_model,
* is the same in every node
* \sa LoadCheckPoint, CheckPoint, VersionNumber
*/
inline void LazyCheckPoint(const ISerializable *global_model);
inline void LazyCheckPoint(const Serializable *global_model);
/*!
* \return version number of the current stored model,
* which means how many calls to CheckPoint we made so far

View File

@ -94,8 +94,8 @@ class IEngine {
*
* \sa CheckPoint, VersionNumber
*/
virtual int LoadCheckPoint(ISerializable *global_model,
ISerializable *local_model = NULL) = 0;
virtual int LoadCheckPoint(Serializable *global_model,
Serializable *local_model = NULL) = 0;
/*!
* \brief checkpoints the model, meaning a stage of execution was finished
 * every time we call check point, the version number increases by one
@ -112,8 +112,8 @@ class IEngine {
*
* \sa LoadCheckPoint, VersionNumber
*/
virtual void CheckPoint(const ISerializable *global_model,
const ISerializable *local_model = NULL) = 0;
virtual void CheckPoint(const Serializable *global_model,
const Serializable *local_model = NULL) = 0;
/*!
* \brief This function can be used to replace CheckPoint for global_model only,
* when certain condition is met (see detailed explanation).
@ -134,7 +134,7 @@ class IEngine {
* is the same in every node
* \sa LoadCheckPoint, CheckPoint, VersionNumber
*/
virtual void LazyCheckPoint(const ISerializable *global_model) = 0;
virtual void LazyCheckPoint(const Serializable *global_model) = 0;
/*!
* \return version number of the current stored model,
* which means how many calls to CheckPoint we made so far
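
To make the interface change concrete, here is a minimal sketch of how a model plugs into the checkpoint cycle under the new signatures (the `Model` class and the loop body are hypothetical, not part of this change):
```c++
#include <rabit.h>
#include <vector>

// hypothetical model: Load/Save now take dmlc-based Stream pointers
// (rabit::Stream and rabit::Serializable are typedefs of the dmlc types)
class Model : public rabit::Serializable {
 public:
  std::vector<float> weight;
  virtual void Load(rabit::Stream *fi) {
    fi->Read(&weight);
  }
  virtual void Save(rabit::Stream *fo) const {
    fo->Write(weight);
  }
};

int main(int argc, char *argv[]) {
  rabit::Init(argc, argv);
  Model model;
  // returns the version number of the restored model, 0 on a fresh start
  int iter = rabit::LoadCheckPoint(&model);
  if (iter == 0) {
    model.weight.resize(16, 0.0f);  // first run: initialize the model
  }
  const int max_iter = 10;          // illustrative iteration count
  for (int r = iter; r < max_iter; ++r) {
    // ... compute local statistics into model.weight, then synchronize ...
    rabit::Allreduce<rabit::op::Sum>(&model.weight[0], model.weight.size());
    rabit::CheckPoint(&model);      // increases the version number by one
  }
  rabit::Finalize();
  return 0;
}
```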

View File

@ -16,21 +16,10 @@
namespace rabit {
namespace utils {
/*! \brief interface of i/o stream that support seek */
class ISeekStream: public IStream {
public:
// virtual destructor
virtual ~ISeekStream(void) {}
/*! \brief seek to certain position of the file */
virtual void Seek(size_t pos) = 0;
/*! \brief tell the position of the stream */
virtual size_t Tell(void) = 0;
/*! \return whether we are at end of file */
virtual bool AtEnd(void) const = 0;
};
/*! \brief re-use definition of dmlc::SeekStream */
typedef dmlc::SeekStream SeekStream;
/*! \brief fixed size memory buffer */
struct MemoryFixSizeBuffer : public ISeekStream {
struct MemoryFixSizeBuffer : public SeekStream {
public:
MemoryFixSizeBuffer(void *p_buffer, size_t buffer_size)
: p_buffer_(reinterpret_cast<char*>(p_buffer)),
@ -72,7 +61,7 @@ struct MemoryFixSizeBuffer : public ISeekStream {
}; // class MemoryFixSizeBuffer
/*! \brief a in memory buffer that can be read and write as stream interface */
struct MemoryBufferStream : public ISeekStream {
struct MemoryBufferStream : public SeekStream {
public:
explicit MemoryBufferStream(std::string *p_buffer)
: p_buffer_(p_buffer) {
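
Because both in-memory buffers now derive from `dmlc::SeekStream`, they can stand in wherever a `Stream` is expected. A minimal sketch of an in-memory round-trip follows (the `rabit/io.h` include path and the `Seek(0)` rewind are assumptions based on this header):
```c++
#include <rabit/io.h>   // include path assumed for this tree
#include <string>
#include <vector>

int main() {
  std::string blob;
  rabit::utils::MemoryBufferStream fs(&blob);
  std::vector<int> data(4, 7);
  fs.Write(data);   // size-prefixed vector helper inherited from dmlc::Stream
  fs.Seek(0);       // rewind (SeekStream interface); assumed supported here
  std::vector<int> copy;
  fs.Read(&copy);   // copy now holds the same four values as data
  return 0;
}
```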

View File

@ -178,17 +178,17 @@ inline void TrackerPrintf(const char *fmt, ...) {
}
#endif
// load latest check point
inline int LoadCheckPoint(ISerializable *global_model,
ISerializable *local_model) {
inline int LoadCheckPoint(Serializable *global_model,
Serializable *local_model) {
return engine::GetEngine()->LoadCheckPoint(global_model, local_model);
}
// checkpoint the model, meaning we finished a stage of execution
inline void CheckPoint(const ISerializable *global_model,
const ISerializable *local_model) {
inline void CheckPoint(const Serializable *global_model,
const Serializable *local_model) {
engine::GetEngine()->CheckPoint(global_model, local_model);
}
// lazy checkpoint the model, only remember the pointer to global_model
inline void LazyCheckPoint(const ISerializable *global_model) {
inline void LazyCheckPoint(const Serializable *global_model) {
engine::GetEngine()->LazyCheckPoint(global_model);
}
// return the version number of currently stored model

View File

@ -9,98 +9,19 @@
#include <vector>
#include <string>
#include "./rabit/utils.h"
#include "./dmlc/io.h"
namespace rabit {
/*!
* \brief interface of stream I/O, used by ISerializable
* \sa ISerializable
* \brief defines stream used in rabit
* see definition of Stream in dmlc/io.h
*/
class IStream {
public:
typedef dmlc::Stream Stream;
/*!
* \brief reads data from a stream
* \param ptr pointer to a memory buffer
* \param size block size
* \return the size of data read
* \brief defines serializable objects used in rabit
* see definition of Serializable in dmlc/io.h
*/
virtual size_t Read(void *ptr, size_t size) = 0;
/*!
* \brief writes data to a stream
* \param ptr pointer to a memory buffer
* \param size block size
*/
virtual void Write(const void *ptr, size_t size) = 0;
/*! \brief virtual destructor */
virtual ~IStream(void) {}
typedef dmlc::Serializable Serializable;
public:
// helper functions to write/read different data structures
/*!
* \brief writes a vector
* \param vec vector to be written/serialized
*/
template<typename T>
inline void Write(const std::vector<T> &vec) {
uint64_t sz = static_cast<uint64_t>(vec.size());
this->Write(&sz, sizeof(sz));
if (sz != 0) {
this->Write(&vec[0], sizeof(T) * sz);
}
}
/*!
* \brief loads a vector
* \param out_vec vector to be loaded/deserialized
* \return whether the load was successful
*/
template<typename T>
inline bool Read(std::vector<T> *out_vec) {
uint64_t sz;
if (this->Read(&sz, sizeof(sz)) == 0) return false;
out_vec->resize(sz);
if (sz != 0) {
if (this->Read(&(*out_vec)[0], sizeof(T) * sz) == 0) return false;
}
return true;
}
/*!
* \brief writes a string
* \param str the string to be written/serialized
*/
inline void Write(const std::string &str) {
uint64_t sz = static_cast<uint64_t>(str.length());
this->Write(&sz, sizeof(sz));
if (sz != 0) {
this->Write(&str[0], sizeof(char) * sz);
}
}
/*!
* \brief loads a string
* \param out_str string to be loaded/deserialized
* \return whether the load/deserialization was successful
*/
inline bool Read(std::string *out_str) {
uint64_t sz;
if (this->Read(&sz, sizeof(sz)) == 0) return false;
out_str->resize(sz);
if (sz != 0) {
if (this->Read(&(*out_str)[0], sizeof(char) * sz) == 0) return false;
}
return true;
}
};
/*! \brief interface for serializable objects */
class ISerializable {
public:
/*!
* \brief load the model from a stream
* \param fi stream where to load the model from
*/
virtual void Load(IStream &fi) = 0;
/*!
* \brief saves the model to a stream
* \param fo stream where to save the model to
*/
virtual void Save(IStream &fo) const = 0;
};
} // namespace rabit
#endif // RABIT_RABIT_SERIALIZABLE_H_

View File

@ -1,2 +0,0 @@
config.mk
*.log

View File

@ -1,17 +0,0 @@
Rabit-Learn
====
This folder contains implementations of distributed machine learning algorithms built with rabit.
It also contains links to Machine Learning packages that use rabit.
* Contributions of toolkits, examples, and benchmarks are more than welcome!
Toolkits
====
* [KMeans Clustering](kmeans)
* [Linear and Logistic Regression](linear)
* [XGBoost: eXtreme Gradient Boosting](https://github.com/tqchen/xgboost/tree/master/multi-node)
  - xgboost is a very fast boosted tree (also known as GBDT) library that can run more than
    10 times faster than existing packages
  - Rabit carries xgboost to distributed environments, inheriting all the benefits of the
    single-node version of xgboost and scaling it to even larger problems

View File

@ -1,2 +0,0 @@
This folder contains the processed example dataset used by the demos.
Copyright of the dataset belongs to the original copyright holder.

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,126 +0,0 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i

View File

@ -1,58 +0,0 @@
#ifndef RABIT_LEARN_IO_BUFFER_READER_INL_H_
#define RABIT_LEARN_IO_BUFFER_READER_INL_H_
/*!
* \file buffer_reader-inl.h
* \brief implementation of stream buffer reader
* \author Tianqi Chen
*/
#include "./io.h"
namespace rabit {
namespace io {
/*! \brief buffer reader of the stream that allows you to get characters one at a time */
class StreamBufferReader {
public:
StreamBufferReader(size_t buffer_size)
:stream_(NULL),
read_len_(1), read_ptr_(1) {
buffer_.resize(buffer_size);
}
/*!
* \brief set input stream
*/
inline void set_stream(IStream *stream) {
stream_ = stream;
read_len_ = read_ptr_ = 1;
}
/*!
* \brief allows quick read using get char
*/
inline char GetChar(void) {
while (true) {
if (read_ptr_ < read_len_) {
return buffer_[read_ptr_++];
} else {
read_len_ = stream_->Read(&buffer_[0], buffer_.length());
if (read_len_ == 0) return EOF;
read_ptr_ = 0;
}
}
}
/*! \brief whether we are reaching the end of file */
inline bool AtEnd(void) const {
return read_len_ == 0;
}
private:
/*! \brief the underlying stream */
IStream *stream_;
/*! \brief buffer to hold data */
std::string buffer_;
/*! \brief length of valid data in buffer */
size_t read_len_;
/*! \brief pointer in the buffer */
size_t read_ptr_;
};
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_BUFFER_READER_INL_H_

View File

@ -1,112 +0,0 @@
#ifndef RABIT_LEARN_IO_FILE_INL_H_
#define RABIT_LEARN_IO_FILE_INL_H_
/*!
* \file file-inl.h
* \brief normal filesystem I/O
* \author Tianqi Chen
*/
#include <string>
#include <vector>
#include <cstdio>
#include "./io.h"
#include "./line_split-inl.h"
/*! \brief io interface */
namespace rabit {
namespace io {
/*! \brief implementation of file i/o stream */
class FileStream : public utils::ISeekStream {
public:
explicit FileStream(const char *fname, const char *mode)
: use_stdio(false) {
using namespace std;
#ifndef RABIT_STRICT_CXX98_
if (!strcmp(fname, "stdin")) {
use_stdio = true; fp = stdin;
}
if (!strcmp(fname, "stdout")) {
use_stdio = true; fp = stdout;
}
#endif
if (!strncmp(fname, "file://", 7)) fname += 7;
if (!use_stdio) {
std::string flag = mode;
if (flag == "w") flag = "wb";
if (flag == "r") flag = "rb";
fp = utils::FopenCheck(fname, flag.c_str());
}
}
virtual ~FileStream(void) {
this->Close();
}
virtual size_t Read(void *ptr, size_t size) {
return std::fread(ptr, 1, size, fp);
}
virtual void Write(const void *ptr, size_t size) {
std::fwrite(ptr, size, 1, fp);
}
virtual void Seek(size_t pos) {
std::fseek(fp, static_cast<long>(pos), SEEK_SET);
}
virtual size_t Tell(void) {
return std::ftell(fp);
}
virtual bool AtEnd(void) const {
return std::feof(fp) != 0;
}
inline void Close(void) {
if (fp != NULL && !use_stdio) {
std::fclose(fp); fp = NULL;
}
}
private:
std::FILE *fp;
bool use_stdio;
};
/*! \brief line split from normal file system */
class FileProvider : public LineSplitter::IFileProvider {
public:
explicit FileProvider(const char *uri) {
LineSplitter::SplitNames(&fnames_, uri, "#");
std::vector<size_t> fsize;
for (size_t i = 0; i < fnames_.size(); ++i) {
if (!std::strncmp(fnames_[i].c_str(), "file://", 7)) {
std::string tmp = fnames_[i].c_str() + 7;
fnames_[i] = tmp;
}
size_t fz = GetFileSize(fnames_[i].c_str());
if (fz != 0) {
fsize_.push_back(fz);
}
}
}
  // destructor
virtual ~FileProvider(void) {}
virtual utils::ISeekStream *Open(size_t file_index) {
utils::Assert(file_index < fnames_.size(), "file index exceed bound");
return new FileStream(fnames_[file_index].c_str(), "rb");
}
virtual const std::vector<size_t> &FileSize(void) const {
return fsize_;
}
private:
// file sizes
std::vector<size_t> fsize_;
// file names
std::vector<std::string> fnames_;
// get file size
inline static size_t GetFileSize(const char *fname) {
std::FILE *fp = utils::FopenCheck(fname, "rb");
// NOTE: fseek may not be good, but serves as ok solution
std::fseek(fp, 0, SEEK_END);
size_t fsize = static_cast<size_t>(std::ftell(fp));
std::fclose(fp);
return fsize;
}
};
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_FILE_INL_H_

View File

@ -1,165 +0,0 @@
#ifndef RABIT_LEARN_IO_HDFS_INL_H_
#define RABIT_LEARN_IO_HDFS_INL_H_
/*!
* \file hdfs-inl.h
* \brief HDFS I/O
* \author Tianqi Chen
*/
#include <string>
#include <cstdlib>
#include <vector>
#include <hdfs.h>
#include <errno.h>
#include "./io.h"
#include "./line_split-inl.h"
/*! \brief io interface */
namespace rabit {
namespace io {
class HDFSStream : public ISeekStream {
public:
HDFSStream(hdfsFS fs,
const char *fname,
const char *mode,
bool disconnect_when_done)
: fs_(fs), at_end_(false),
disconnect_when_done_(disconnect_when_done) {
int flag = 0;
if (!strcmp(mode, "r")) {
flag = O_RDONLY;
} else if (!strcmp(mode, "w")) {
flag = O_WRONLY;
} else if (!strcmp(mode, "a")) {
flag = O_WRONLY | O_APPEND;
} else {
utils::Error("HDFSStream: unknown flag %s", mode);
}
fp_ = hdfsOpenFile(fs_, fname, flag, 0, 0, 0);
utils::Check(fp_ != NULL,
"HDFSStream: fail to open %s", fname);
}
virtual ~HDFSStream(void) {
this->Close();
if (disconnect_when_done_) {
utils::Check(hdfsDisconnect(fs_) == 0, "hdfsDisconnect error");
}
}
virtual size_t Read(void *ptr, size_t size) {
tSize nread = hdfsRead(fs_, fp_, ptr, size);
if (nread == -1) {
int errsv = errno;
utils::Error("HDFSStream.Read Error:%s", strerror(errsv));
}
if (nread == 0) {
at_end_ = true;
}
return static_cast<size_t>(nread);
}
virtual void Write(const void *ptr, size_t size) {
const char *buf = reinterpret_cast<const char*>(ptr);
while (size != 0) {
tSize nwrite = hdfsWrite(fs_, fp_, buf, size);
if (nwrite == -1) {
int errsv = errno;
utils::Error("HDFSStream.Write Error:%s", strerror(errsv));
}
size_t sz = static_cast<size_t>(nwrite);
buf += sz; size -= sz;
}
}
virtual void Seek(size_t pos) {
if (hdfsSeek(fs_, fp_, pos) != 0) {
int errsv = errno;
utils::Error("HDFSStream.Seek Error:%s", strerror(errsv));
}
}
virtual size_t Tell(void) {
tOffset offset = hdfsTell(fs_, fp_);
if (offset == -1) {
int errsv = errno;
utils::Error("HDFSStream.Tell Error:%s", strerror(errsv));
}
return static_cast<size_t>(offset);
}
virtual bool AtEnd(void) const {
return at_end_;
}
inline void Close(void) {
if (fp_ != NULL) {
if (hdfsCloseFile(fs_, fp_) == -1) {
int errsv = errno;
utils::Error("HDFSStream.Close Error:%s", strerror(errsv));
}
fp_ = NULL;
}
}
inline static std::string GetNameNode(void) {
const char *nn = getenv("rabit_hdfs_namenode");
if (nn == NULL) {
return std::string("default");
} else {
return std::string(nn);
}
}
private:
hdfsFS fs_;
hdfsFile fp_;
bool at_end_;
bool disconnect_when_done_;
};
/*! \brief line split from normal file system */
class HDFSProvider : public LineSplitter::IFileProvider {
public:
explicit HDFSProvider(const char *uri) {
fs_ = hdfsConnect(HDFSStream::GetNameNode().c_str(), 0);
utils::Check(fs_ != NULL, "error when connecting to default HDFS");
std::vector<std::string> paths;
LineSplitter::SplitNames(&paths, uri, "#");
// get the files
for (size_t i = 0; i < paths.size(); ++i) {
hdfsFileInfo *info = hdfsGetPathInfo(fs_, paths[i].c_str());
utils::Check(info != NULL, "path %s do not exist", paths[i].c_str());
if (info->mKind == 'D') {
int nentry;
hdfsFileInfo *files = hdfsListDirectory(fs_, info->mName, &nentry);
utils::Check(files != NULL, "error when ListDirectory %s", info->mName);
for (int i = 0; i < nentry; ++i) {
if (files[i].mKind == 'F' && files[i].mSize != 0) {
fsize_.push_back(files[i].mSize);
fnames_.push_back(std::string(files[i].mName));
}
}
hdfsFreeFileInfo(files, nentry);
} else {
if (info->mSize != 0) {
fsize_.push_back(info->mSize);
fnames_.push_back(std::string(info->mName));
}
}
hdfsFreeFileInfo(info, 1);
}
}
virtual ~HDFSProvider(void) {
utils::Check(hdfsDisconnect(fs_) == 0, "hdfsDisconnect error");
}
virtual const std::vector<size_t> &FileSize(void) const {
return fsize_;
}
virtual ISeekStream *Open(size_t file_index) {
utils::Assert(file_index < fnames_.size(), "file index exceed bound");
return new HDFSStream(fs_, fnames_[file_index].c_str(), "r", false);
}
private:
// hdfs handle
hdfsFS fs_;
// file sizes
std::vector<size_t> fsize_;
// file names
std::vector<std::string> fnames_;
};
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_HDFS_INL_H_

View File

@ -1,68 +0,0 @@
#ifndef RABIT_LEARN_IO_IO_INL_H_
#define RABIT_LEARN_IO_IO_INL_H_
/*!
* \file io-inl.h
 * \brief Input/Output utils that handle read/write
 *  of files in a distributed environment
* \author Tianqi Chen
*/
#include <cstring>
#include "./io.h"
#if RABIT_USE_HDFS
#include "./hdfs-inl.h"
#endif
#include "./file-inl.h"
namespace rabit {
namespace io {
/*!
* \brief create input split given a uri
* \param uri the uri of the input, can contain hdfs prefix
* \param part the part id of current input
* \param nsplit total number of splits
*/
inline InputSplit *CreateInputSplit(const char *uri,
unsigned part,
unsigned nsplit) {
using namespace std;
if (!strcmp(uri, "stdin")) {
return new SingleFileSplit(uri);
}
if (!strncmp(uri, "file://", 7)) {
return new LineSplitter(new FileProvider(uri), part, nsplit);
}
if (!strncmp(uri, "hdfs://", 7)) {
#if RABIT_USE_HDFS
return new LineSplitter(new HDFSProvider(uri), part, nsplit);
#else
utils::Error("Please compile with RABIT_USE_HDFS=1");
#endif
}
return new LineSplitter(new FileProvider(uri), part, nsplit);
}
/*!
 * \brief create a stream; the stream must be able to close
 *  the underlying resources (files) when deleted
*
* \param uri the uri of the input, can contain hdfs prefix
* \param mode can be 'w' or 'r' for read or write
*/
inline IStream *CreateStream(const char *uri, const char *mode) {
using namespace std;
if (!strncmp(uri, "file://", 7)) {
return new FileStream(uri + 7, mode);
}
if (!strncmp(uri, "hdfs://", 7)) {
#if RABIT_USE_HDFS
return new HDFSStream(hdfsConnect(HDFSStream::GetNameNode().c_str(), 0),
uri, mode, true);
#else
utils::Error("Please compile with RABIT_USE_HDFS=1");
#endif
}
return new FileStream(uri, mode);
}
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_IO_INL_H_

View File

@ -1,62 +0,0 @@
#ifndef RABIT_LEARN_IO_IO_H_
#define RABIT_LEARN_IO_IO_H_
/*!
* \file io.h
 * \brief Input/Output utils that handle read/write
 *  of files in a distributed environment
* \author Tianqi Chen
*/
#include "../../include/rabit_serializable.h"
/*! \brief whether compile with HDFS support */
#ifndef RABIT_USE_HDFS
#define RABIT_USE_HDFS 0
#endif
/*! \brief io interface */
namespace rabit {
/*!
* \brief namespace to handle input split and filesystem interfacing
*/
namespace io {
/*! \brief reused ISeekStream's definition */
typedef utils::ISeekStream ISeekStream;
/*!
* \brief user facing input split helper,
* can be used to get the partition of data used by current node
*/
class InputSplit {
public:
/*!
* \brief get next line, store into out_data
* \param out_data the string that stores the line data,
* \n is not included
   * \return true if the next line was found, false if we have read all the lines
*/
virtual bool NextLine(std::string *out_data) = 0;
/*! \brief destructor*/
virtual ~InputSplit(void) {}
};
/*!
* \brief create input split given a uri
* \param uri the uri of the input, can contain hdfs prefix
* \param part the part id of current input
* \param nsplit total number of splits
*/
inline InputSplit *CreateInputSplit(const char *uri,
unsigned part,
unsigned nsplit);
/*!
 * \brief create a stream; the stream must be able to close
 *  the underlying resources (files) when deleted
*
* \param uri the uri of the input, can contain hdfs prefix
* \param mode can be 'w' or 'r' for read or write
*/
inline IStream *CreateStream(const char *uri, const char *mode);
} // namespace io
} // namespace rabit
#include "./io-inl.h"
#include "./base64-inl.h"
#endif // RABIT_LEARN_IO_IO_H_

View File

@ -1,206 +0,0 @@
#ifndef RABIT_LEARN_IO_LINE_SPLIT_INL_H_
#define RABIT_LEARN_IO_LINE_SPLIT_INL_H_
/*!
 * \file line_split-inl.h
* \brief base implementation of line-spliter
* \author Tianqi Chen
*/
#include <vector>
#include <utility>
#include <cstring>
#include <string>
#include "../../include/rabit.h"
#include "./io.h"
#include "./buffer_reader-inl.h"
namespace rabit {
namespace io {
/*! \brief class that split the files by line */
class LineSplitter : public InputSplit {
public:
class IFileProvider {
public:
/*!
* \brief get the seek stream of given file_index
* \return the corresponding seek stream at head of the stream
* the seek stream's resource can be freed by calling delete
*/
virtual ISeekStream *Open(size_t file_index) = 0;
/*!
* \return const reference to size of each files
*/
virtual const std::vector<size_t> &FileSize(void) const = 0;
// virtual destructor
virtual ~IFileProvider() {}
};
// constructor
explicit LineSplitter(IFileProvider *provider,
unsigned rank,
unsigned nsplit)
: provider_(provider), fs_(NULL),
reader_(kBufferSize) {
this->Init(provider_->FileSize(), rank, nsplit);
}
// destructor
virtual ~LineSplitter() {
if (fs_ != NULL) {
delete fs_; fs_ = NULL;
}
// delete provider after destructing the streams
delete provider_;
}
// get next line
virtual bool NextLine(std::string *out_data) {
if (file_ptr_ >= file_ptr_end_ &&
offset_curr_ >= offset_end_) return false;
out_data->clear();
while (true) {
char c = reader_.GetChar();
if (reader_.AtEnd()) {
if (out_data->length() != 0) return true;
file_ptr_ += 1;
if (offset_curr_ >= offset_end_) return false;
if (offset_curr_ != file_offset_[file_ptr_]) {
utils::Error("warning: file size not calculated correctly\n");
offset_curr_ = file_offset_[file_ptr_];
}
utils::Assert(file_ptr_ + 1 < file_offset_.size(),
"boundary check");
delete fs_;
fs_ = provider_->Open(file_ptr_);
reader_.set_stream(fs_);
} else {
++offset_curr_;
if (c != '\r' && c != '\n' && c != EOF) {
*out_data += c;
} else {
if (out_data->length() != 0) return true;
if (file_ptr_ >= file_ptr_end_ &&
offset_curr_ >= offset_end_) return false;
}
}
}
}
/*!
   * \brief split file names given in the uri
   * \param out_fname output file names
   * \param uri_ the input uri
   * \param dlm delimiter
*/
inline static void SplitNames(std::vector<std::string> *out_fname,
const char *uri_,
const char *dlm) {
std::string uri = uri_;
char *p = std::strtok(BeginPtr(uri), dlm);
while (p != NULL) {
out_fname->push_back(std::string(p));
p = std::strtok(NULL, dlm);
}
}
private:
/*!
* \brief initialize the line spliter,
   * \param file_size size of each file
* \param rank the current rank of the data
* \param nsplit number of split we will divide the data into
*/
inline void Init(const std::vector<size_t> &file_size,
unsigned rank, unsigned nsplit) {
file_offset_.resize(file_size.size() + 1);
file_offset_[0] = 0;
for (size_t i = 0; i < file_size.size(); ++i) {
file_offset_[i + 1] = file_offset_[i] + file_size[i];
}
size_t ntotal = file_offset_.back();
size_t nstep = (ntotal + nsplit - 1) / nsplit;
offset_begin_ = std::min(nstep * rank, ntotal);
offset_end_ = std::min(nstep * (rank + 1), ntotal);
offset_curr_ = offset_begin_;
if (offset_begin_ == offset_end_) return;
file_ptr_ = std::upper_bound(file_offset_.begin(),
file_offset_.end(),
offset_begin_) - file_offset_.begin() - 1;
file_ptr_end_ = std::upper_bound(file_offset_.begin(),
file_offset_.end(),
offset_end_) - file_offset_.begin() - 1;
fs_ = provider_->Open(file_ptr_);
reader_.set_stream(fs_);
// try to set the starting position correctly
if (file_offset_[file_ptr_] != offset_begin_) {
fs_->Seek(offset_begin_ - file_offset_[file_ptr_]);
while (true) {
char c = reader_.GetChar();
if (!reader_.AtEnd()) ++offset_curr_;
if (c == '\n' || c == '\r' || c == EOF) return;
}
}
}
private:
/*! \brief FileProvider */
IFileProvider *provider_;
/*! \brief current input stream */
utils::ISeekStream *fs_;
/*! \brief file pointer of which file to read on */
size_t file_ptr_;
/*! \brief file pointer where the end of file lies */
size_t file_ptr_end_;
/*! \brief get the current offset */
size_t offset_curr_;
/*! \brief beginning of offset */
size_t offset_begin_;
/*! \brief end of the offset */
size_t offset_end_;
/*! \brief byte-offset of each file */
std::vector<size_t> file_offset_;
/*! \brief buffer reader */
StreamBufferReader reader_;
/*! \brief buffer size */
const static size_t kBufferSize = 256;
};
/*! \brief line split from a single file */
class SingleFileSplit : public InputSplit {
public:
explicit SingleFileSplit(const char *fname) {
if (!std::strcmp(fname, "stdin")) {
#ifndef RABIT_STRICT_CXX98_
use_stdin_ = true; fp_ = stdin;
#endif
}
if (!use_stdin_) {
fp_ = utils::FopenCheck(fname, "r");
}
end_of_file_ = false;
}
virtual ~SingleFileSplit(void) {
if (!use_stdin_) std::fclose(fp_);
}
virtual bool NextLine(std::string *out_data) {
if (end_of_file_) return false;
out_data->clear();
while (true) {
char c = std::fgetc(fp_);
if (c == EOF) {
end_of_file_ = true;
}
if (c != '\r' && c != '\n' && c != EOF) {
*out_data += c;
} else {
if (out_data->length() != 0) return true;
if (end_of_file_) return false;
}
}
return false;
}
private:
std::FILE *fp_;
bool use_stdin_;
bool end_of_file_;
};
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_LINE_SPLIT_INL_H_

View File

@ -1,2 +0,0 @@
kmeans
*.mpi

View File

@ -1,15 +0,0 @@
# specify tensor path
BIN = kmeans.rabit
MOCKBIN= kmeans.mock
MPIBIN = kmeans.mpi
# object files that make up the program
OBJ = kmeans.o
# common build script for programs
include ../make/common.mk
# dependencies here
kmeans.rabit: kmeans.o lib
kmeans.mock: kmeans.o lib
kmeans.mpi: kmeans.o libmpi
kmeans.o: kmeans.cc ../../src/*.h

View File

@ -1,129 +0,0 @@
Toolkit
====
This folder contains some example toolkits developed with rabit to help you get started.
KMeans
====
## Input File Format
KMeans uses the LIBSVM format to parse the input. If you are not familiar with LIBSVM, you will find more details [here](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/).
The format is the following:
<label> <index1>:<value1> <index2>:<value2> ...
where label is a dummy integer value in this case (you can add 1's to every example), index<x> is the index for feature x, and value<x> is the value of feature x.
## Output File Format
KMeans currently outputs the centroids as dense vectors. Each line in the output file corresponds to a centroid. The number of lines in the file must match the number of clusters K you specified in the command line.
## Example
Let's go over a more detailed example...
#### Preprocess
Download the smallwiki dataset used in the Machine Learning for Big Data class at University of Washington.
http://courses.cs.washington.edu/courses/cse547/14wi/datasets/smallwiki.zip
Unzip it, you should find three files:
* tfidf.txt: each row is in the form "docid||termid1:tfidf1,termid2:tfidf2,..."
* dictionary.txt: map of term to termid
* cluster0.txt: initial cluster centers. Won't be needed.
The first thing to do is to convert the tfidf file format into the input format rabit supports, i.e. LIBSVM. For that, you can use a simple python script. The following should suffice. You should redirect the output to a file, let's say tfidf.libsvm.
```python
for line in open("tfidf.txt").read().splitlines():
    example = line.split('|')[1].split(',')
    example = ' '.join(example)
    print '%s %s' % (1, example)
```
#### Compile
You will then need to build the KMeans program with ```make```, which will produce three binaries:
* kmeans.mpi: runs on MPI.
* kmeans.mock: uses a mock to simulate error conditions for testing purposes.
* kmeans.rabit: uses our C++ implementation.
#### Running with Hadoop
If you want to run it with Hadoop, you can execute the [./kmeans_hadoop.sh](./kmeans_hadoop.sh) script from the master node of your cluster.
You will have to edit the file in order to specify the path to the Hadoop Streaming jar. Afterwards, you can execute it with the following arguments (in the exact same order):
* number of worker nodes in your Hadoop cluster (i.e. number of slave nodes)
* path to the input data (HDFS path where you put the preprocessed file in libsvm format)
* number of clusters K (let's use 20 for this example)
* number of iterations to perform (let's use just 5 iterations)
* output path (HDFS path where to store the output data, must be a non-existent folder)
The current implementation runs for the number of iterations you specify in the command line argument. If you would like to add a convergence criterion (e.g. stopping when no cluster assignment changes between iterations), you will have to modify [./kmeans.cc](./kmeans.cc). We leave that as an exercise to the reader :)
You may have noticed that [./kmeans_hadoop.sh](./kmeans_hadoop.sh) uses the kmeans.rabit binary, but you can also use kmeans.mock in order to easily test your system's behavior in the presence of failures. More on that later.
Don't forget to copy the preprocessed file into HDFS and create the output folder. For example, inside the bin folder in Hadoop, you can execute the following:
```bash
$ ./hadoop fs -mkdir kmeans
$ ./hadoop fs -mkdir kmeans/in
$ ./hadoop fs -put tfidf.libsvm kmeans/in
$ ./hadoop fs -mkdir kmeans/out
```
#### Running with MPI
You will need to have an MPI implementation installed, for example OpenMPI. In order to run the program, you can use mpirun to submit the job. This is a non-fault-tolerant version, as it is backed by MPI.
#### Running with Mock
As previously mentioned, you can execute the kmeans example, and any of your own, with the mock binary. This will allow you to test error conditions while you are developing your algorithms. As explained in the [Tutorial](../guide), passing the script certain parameters (e.g. mock=0,0,1,0) will cause a certain node to exit after calling Allreduce/Broadcast in a given iteration.
You can also run this locally; you will only need to split the input file into several smaller files, each of which will be used by a particular process in the shared-memory environment. You can use a Unix command line tool such as split.
#### Processing Output
Once the program finishes running, you can fetch the output from HDFS. For example, inside the bin folder in Hadoop, you can execute the following:
```bash
$ ./hadoop fs -get kmeans/out/part-00000 kmeans.out
```
Each line of the output file is a centroid in dense format. Since the words of this dataset are listed in the dictionary.txt file, you can do some simple post-processing to recover the top 10 words of each centroid. Something like this should work:
```python
words = {}
for line in open("dictionary.txt").read().splitlines():
    word, index = line.split(' ')
    words[int(index)] = word

from collections import defaultdict
clusters = defaultdict(list)
cluster_name = 0
for line in open("kmeans.out").read().splitlines():
    line = line.split(' ')
    clusters[cluster_name].extend(line)
    cluster_name += 1

import numpy as np
for j, key in enumerate(clusters):
    elements = clusters[key]
    array = np.array(elements).astype(np.float32)
    idx = np.argsort(array)[::-1][:10]
    ws = []
    for i in idx:
        ws.append(words[i])
    print 'cluster %d = %s' % (j, ' '.join(ws))
```

View File

@ -1,165 +0,0 @@
// this is a test case to test whether rabit can recover model when
// facing an exception
#include <rabit.h>
#include <rabit/utils.h>
#include <time.h>
#include "../utils/data.h"
using namespace rabit;
// kmeans model
class Model : public rabit::ISerializable {
public:
// matrix of centroids
Matrix centroids;
// load from stream
virtual void Load(rabit::IStream &fi) {
fi.Read(&centroids.nrow, sizeof(centroids.nrow));
fi.Read(&centroids.ncol, sizeof(centroids.ncol));
fi.Read(&centroids.data);
}
/*! \brief save the model to the stream */
virtual void Save(rabit::IStream &fo) const {
fo.Write(&centroids.nrow, sizeof(centroids.nrow));
fo.Write(&centroids.ncol, sizeof(centroids.ncol));
fo.Write(centroids.data);
}
virtual void InitModel(unsigned num_cluster, unsigned feat_dim) {
centroids.Init(num_cluster, feat_dim);
}
// normalize L2 norm
inline void Normalize(void) {
for (size_t i = 0; i < centroids.nrow; ++i) {
float *row = centroids[i];
double wsum = 0.0;
for (size_t j = 0; j < centroids.ncol; ++j) {
wsum += row[j] * row[j];
}
wsum = sqrt(wsum);
if (wsum < 1e-6) return;
float winv = 1.0 / wsum;
for (size_t j = 0; j < centroids.ncol; ++j) {
row[j] *= winv;
}
}
}
};
inline void InitCentroids(const SparseMat &data, Matrix *centroids) {
int num_cluster = centroids->nrow;
for (int i = 0; i < num_cluster; ++i) {
int index = Random(data.NumRow());
SparseMat::Vector v = data[index];
for (unsigned j = 0; j < v.length; ++j) {
(*centroids)[i][v[j].findex] = v[j].fvalue;
}
}
for (int i = 0; i < num_cluster; ++i) {
int proc = Random(rabit::GetWorldSize());
rabit::Broadcast((*centroids)[i], centroids->ncol * sizeof(float), proc);
}
}
inline double Cos(const float *row,
const SparseMat::Vector &v) {
double rdot = 0.0, rnorm = 0.0;
for (unsigned i = 0; i < v.length; ++i) {
rdot += row[v[i].findex] * v[i].fvalue;
rnorm += v[i].fvalue * v[i].fvalue;
}
return rdot / sqrt(rnorm);
}
inline size_t GetCluster(const Matrix &centroids,
const SparseMat::Vector &v) {
size_t imin = 0;
double dmin = Cos(centroids[0], v);
for (size_t k = 1; k < centroids.nrow; ++k) {
double dist = Cos(centroids[k], v);
if (dist > dmin) {
dmin = dist; imin = k;
}
}
return imin;
}
int main(int argc, char *argv[]) {
if (argc < 5) {
    // initialize rabit engine
rabit::Init(argc, argv);
if (rabit::GetRank() == 0) {
rabit::TrackerPrintf("Usage: <data_dir> num_cluster max_iter <out_model>\n");
}
rabit::Finalize();
return 0;
}
clock_t tStart = clock();
srand(0);
// load the data
SparseMat data;
data.Load(argv[1]);
// set the parameters
int num_cluster = atoi(argv[2]);
int max_iter = atoi(argv[3]);
  // initialize rabit engine
rabit::Init(argc, argv);
// load model
Model model;
int iter = rabit::LoadCheckPoint(&model);
if (iter == 0) {
rabit::Allreduce<op::Max>(&data.feat_dim, 1);
model.InitModel(num_cluster, data.feat_dim);
InitCentroids(data, &model.centroids);
model.Normalize();
rabit::TrackerPrintf("[%d] start at %s\n",
rabit::GetRank(), rabit::GetProcessorName().c_str());
} else {
rabit::TrackerPrintf("[%d] restart iter=%d\n", rabit::GetRank(), iter);
}
const unsigned num_feat = data.feat_dim;
// matrix to store the result
Matrix temp;
for (int r = iter; r < max_iter; ++r) {
temp.Init(num_cluster, num_feat + 1, 0.0f);
#if __cplusplus >= 201103L
auto lazy_get_centroid = [&]()
#endif
{
// lambda function used to calculate the data if necessary
// this function may not be called when the result can be directly recovered
const size_t ndata = data.NumRow();
for (size_t i = 0; i < ndata; ++i) {
SparseMat::Vector v = data[i];
size_t k = GetCluster(model.centroids, v);
// temp[k] += v
for (size_t j = 0; j < v.length; ++j) {
temp[k][v[j].findex] += v[j].fvalue;
}
// use last column to record counts
temp[k][num_feat] += 1.0f;
}
};
// call allreduce
#if __cplusplus >= 201103L
rabit::Allreduce<op::Sum>(&temp.data[0], temp.data.size(), lazy_get_centroid);
#else
rabit::Allreduce<op::Sum>(&temp.data[0], temp.data.size());
#endif
// set number
for (int k = 0; k < num_cluster; ++k) {
float cnt = temp[k][num_feat];
utils::Check(cnt != 0.0f, "get zero sized cluster");
for (unsigned i = 0; i < num_feat; ++i) {
model.centroids[k][i] = temp[k][i] / cnt;
}
}
model.Normalize();
rabit::CheckPoint(&model);
}
// output the model file to somewhere
if (rabit::GetRank() == 0) {
model.centroids.Print(argv[4]);
}
rabit::TrackerPrintf("[%d] Time taken: %f seconds\n", rabit::GetRank(), static_cast<float>(clock() - tStart) / CLOCKS_PER_SEC);
rabit::Finalize();
return 0;
}

View File

@ -1,9 +0,0 @@
#!/bin/bash
if [ "$#" -lt 5 ];
then
echo "Usage: <nslaves> <input_data> <ncluster> <max_iteration> <output>"
exit -1
fi
#set path to hadoop streaming jar here
STREAMING_JAR=
python ../tracker/rabit_hadoop.py -hs $STREAMING_JAR -n $1 -i $2 -o $5 kmeans.rabit stdin $3 $4 stdout

View File

@ -1,2 +0,0 @@
mushroom.row*
*.model

View File

@ -1,21 +0,0 @@
ifneq ("$(wildcard ../config.mk)","")
config = ../config.mk
else
config = ../make/config.mk
endif
include $(config)
BIN = linear.rabit
MOCKBIN= linear.mock
MPIBIN =
# object files that make up the program
OBJ = linear.o
# common build script for programs
include ../make/common.mk
CFLAGS+=-fopenmp
linear.o: linear.cc ../../src/*.h linear.h ../solver/*.h
# dependencies here
linear.rabit: linear.o lib
linear.mock: linear.o lib

View File

@ -1,48 +0,0 @@
Linear and Logistic Regression
====
* input format: LibSVM
* Local Example: [run-linear.sh](run-linear.sh)
* Running on YARN: [run-yarn.sh](run-yarn.sh)
- You will need to have YARN
- Modify ```../make/config.mk``` to set USE_HDFS=1 to compile with HDFS support
  - Run build.sh in [../../yarn](../../yarn) to build the yarn jar file
Multi-Threading Optimization
====
* The code can be multi-threaded; we encourage you to use it
  - Simply add ```nthread=k``` where k is the number of threads you want to use
* If you submit with YARN
  - Use ```--vcores``` and ```-mem``` to request CPU and memory resources
  - Some schedulers in YARN do not honor CPU requests; you can request more memory to grab working slots
* Multi-threading usually improves speed
  - You can use fewer workers and assign more resources to each worker
  - This usually means less communication overhead and faster running time
Parameters
====
All the parameters can be set by param=value
#### Important Parameters
* objective [default = logistic]
- can be linear or logistic
* base_score [default = 0.5]
- global bias, recommended set to mean value of label
* reg_L1 [default = 0]
  - l1 regularization coefficient
* reg_L2 [default = 1]
  - l2 regularization coefficient
* lbfgs_stop_tol [default = 1e-5]
- relative tolerance level of loss reduction with respect to initial loss
* max_lbfgs_iter [default = 500]
- maximum number of lbfgs iterations
### Optimization Related parameters
* min_lbfgs_iter [default = 5]
- minimum number of lbfgs iterations
* max_linesearch_iter [default = 100]
- maximum number of iterations in linesearch
* linesearch_c1 [default = 1e-4]
  - c1 coefficient in backoff linesearch
* linesarch_backoff [default = 0.5]
- backoff ratio in linesearch

View File

@ -1,227 +0,0 @@
#include "./linear.h"
#include "../io/io.h"
namespace rabit {
namespace linear {
class LinearObjFunction : public solver::IObjFunction<float> {
public:
// training threads
int nthread;
// L2 regularization
float reg_L2;
// model
LinearModel model;
// training data
SparseMat dtrain;
// solver
solver::LBFGSSolver<float> lbfgs;
// constructor
LinearObjFunction(void) {
lbfgs.SetObjFunction(this);
nthread = 1;
reg_L2 = 0.0f;
model.weight = NULL;
task = "train";
model_in = "NULL";
name_pred = "pred.txt";
model_out = "final.model";
}
virtual ~LinearObjFunction(void) {
}
// set parameters
inline void SetParam(const char *name, const char *val) {
model.param.SetParam(name, val);
lbfgs.SetParam(name, val);
if (!strcmp(name, "num_feature")) {
char ndigit[30];
sprintf(ndigit, "%lu", model.param.num_feature + 1);
lbfgs.SetParam("num_dim", ndigit);
}
if (!strcmp(name, "reg_L2")) {
reg_L2 = static_cast<float>(atof(val));
}
if (!strcmp(name, "nthread")) {
nthread = atoi(val);
}
if (!strcmp(name, "task")) task = val;
if (!strcmp(name, "model_in")) model_in = val;
if (!strcmp(name, "model_out")) model_out = val;
if (!strcmp(name, "name_pred")) name_pred = val;
}
inline void Run(void) {
if (model_in != "NULL") {
this->LoadModel(model_in.c_str());
}
if (task == "train") {
lbfgs.Run();
if (rabit::GetRank() == 0) {
this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
}
} else if (task == "pred") {
this->TaskPred();
} else {
utils::Error("unknown task=%s", task.c_str());
}
}
inline void TaskPred(void) {
utils::Check(model_in != "NULL",
"must set model_in for task=pred");
FILE *fp = utils::FopenCheck(name_pred.c_str(), "w");
for (size_t i = 0; i < dtrain.NumRow(); ++i) {
float pred = model.Predict(dtrain[i]);
fprintf(fp, "%g\n", pred);
}
fclose(fp);
printf("Finishing writing to %s\n", name_pred.c_str());
}
inline void LoadModel(const char *fname) {
IStream *fi = io::CreateStream(fname, "r");
std::string header; header.resize(4);
// check header for different binary encode
// can be base64 or binary
utils::Check(fi->Read(&header[0], 4) != 0, "invalid model");
// base64 format
if (header == "bs64") {
io::Base64InStream bsin(fi);
bsin.InitPosition();
model.Load(bsin);
} else if (header == "binf") {
model.Load(*fi);
} else {
utils::Error("invalid model file");
}
delete fi;
}
inline void SaveModel(const char *fname,
const float *wptr,
bool save_base64 = false) {
IStream *fo = io::CreateStream(fname, "w");
if (save_base64 != 0 || !strcmp(fname, "stdout")) {
fo->Write("bs64\t", 5);
io::Base64OutStream bout(fo);
model.Save(bout, wptr);
bout.Finish('\n');
} else {
fo->Write("binf", 4);
model.Save(*fo, wptr);
}
delete fo;
}
inline void LoadData(const char *fname) {
dtrain.Load(fname);
}
virtual size_t InitNumDim(void) {
if (model_in == "NULL") {
size_t ndim = dtrain.feat_dim;
rabit::Allreduce<rabit::op::Max>(&ndim, 1);
model.param.num_feature = std::max(ndim, model.param.num_feature);
}
return model.param.num_feature + 1;
}
virtual void InitModel(float *weight, size_t size) {
if (model_in == "NULL") {
memset(weight, 0.0f, size * sizeof(float));
model.param.InitBaseScore();
} else {
rabit::Broadcast(model.weight, size * sizeof(float), 0);
memcpy(weight, model.weight, size * sizeof(float));
}
}
// load model
virtual void Load(rabit::IStream &fi) {
fi.Read(&model.param, sizeof(model.param));
}
virtual void Save(rabit::IStream &fo) const {
fo.Write(&model.param, sizeof(model.param));
}
virtual double Eval(const float *weight, size_t size) {
if (nthread != 0) omp_set_num_threads(nthread);
utils::Check(size == model.param.num_feature + 1,
"size consistency check");
double sum_val = 0.0;
#pragma omp parallel for schedule(static) reduction(+:sum_val)
for (size_t i = 0; i < dtrain.NumRow(); ++i) {
float py = model.param.PredictMargin(weight, dtrain[i]);
float fv = model.param.MarginToLoss(dtrain.labels[i], py);
sum_val += fv;
}
if (rabit::GetRank() == 0) {
// only add regularization once
if (reg_L2 != 0.0f) {
double sum_sqr = 0.0;
for (size_t i = 0; i < model.param.num_feature; ++i) {
sum_sqr += weight[i] * weight[i];
}
sum_val += 0.5 * reg_L2 * sum_sqr;
}
}
utils::Check(!std::isnan(sum_val), "nan occurs");
return sum_val;
}
virtual void CalcGrad(float *out_grad,
const float *weight,
size_t size) {
if (nthread != 0) omp_set_num_threads(nthread);
utils::Check(size == model.param.num_feature + 1,
"size consistency check");
memset(out_grad, 0.0f, sizeof(float) * size);
double sum_gbias = 0.0;
#pragma omp parallel for schedule(static) reduction(+:sum_gbias)
for (size_t i = 0; i < dtrain.NumRow(); ++i) {
SparseMat::Vector v = dtrain[i];
float py = model.param.Predict(weight, v);
float grad = model.param.PredToGrad(dtrain.labels[i], py);
for (index_t j = 0; j < v.length; ++j) {
out_grad[v[j].findex] += v[j].fvalue * grad;
}
sum_gbias += grad;
}
out_grad[model.param.num_feature] = static_cast<float>(sum_gbias);
if (rabit::GetRank() == 0) {
// only add regularization once
if (reg_L2 != 0.0f) {
for (size_t i = 0; i < model.param.num_feature; ++i) {
out_grad[i] += reg_L2 * weight[i];
}
}
}
}
private:
std::string task;
std::string model_in;
std::string model_out;
std::string name_pred;
};
} // namespace linear
} // namespace rabit
int main(int argc, char *argv[]) {
if (argc < 2) {
    // initialize rabit engine
rabit::Init(argc, argv);
if (rabit::GetRank() == 0) {
rabit::TrackerPrintf("Usage: <data_in> param=val\n");
}
rabit::Finalize();
return 0;
}
rabit::linear::LinearObjFunction *linear = new rabit::linear::LinearObjFunction();
if (!strcmp(argv[1], "stdin")) {
linear->LoadData(argv[1]);
rabit::Init(argc, argv);
} else {
rabit::Init(argc, argv);
linear->LoadData(argv[1]);
}
for (int i = 2; i < argc; ++i) {
char name[256], val[256];
if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) {
linear->SetParam(name, val);
}
}
linear->Run();
delete linear;
rabit::Finalize();
return 0;
}

View File

@ -1,134 +0,0 @@
/*!
* Copyright (c) 2015 by Contributors
* \file linear.h
* \brief Linear and Logistic regression
*
* \author Tianqi Chen
*/
#ifndef RABIT_LINEAR_H_
#define RABIT_LINEAR_H_
#include <omp.h>
#include "../utils/data.h"
#include "../solver/lbfgs.h"
namespace rabit {
namespace linear {
/*! \brief simple linear model */
struct LinearModel {
struct ModelParam {
/*! \brief global bias */
float base_score;
/*! \brief number of features */
size_t num_feature;
/*! \brief loss type*/
int loss_type;
// reserved field
int reserved[16];
// constructor
ModelParam(void) {
memset(this, 0, sizeof(ModelParam));
base_score = 0.5f;
num_feature = 0;
loss_type = 1;
num_feature = 0;
}
// initialize base score
inline void InitBaseScore(void) {
utils::Check(base_score > 0.0f && base_score < 1.0f,
"base_score must be in (0,1) for logistic loss");
base_score = -std::log(1.0f / base_score - 1.0f);
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val) {
using namespace std;
if (!strcmp("base_score", name)) {
base_score = static_cast<float>(atof(val));
}
if (!strcmp("num_feature", name)) {
num_feature = static_cast<size_t>(atol(val));
}
if (!strcmp("objective", name)) {
if (!strcmp("linear", val)) {
loss_type = 0;
} else if (!strcmp("logistic", val)) {
loss_type = 1;
} else {
utils::Error("unknown objective type %s\n", val);
}
}
}
// transform margin to prediction
inline float MarginToPred(float margin) const {
if (loss_type == 1) {
return 1.0f / (1.0f + std::exp(-margin));
} else {
return margin;
}
}
// margin to loss
inline float MarginToLoss(float label, float margin) const {
if (loss_type == 1) {
float nlogprob;
if (margin > 0.0f) {
nlogprob = std::log(1.0f + std::exp(-margin));
} else {
nlogprob = -margin + std::log(1.0f + std::exp(margin));
}
return label * nlogprob +
(1.0f -label) * (margin + nlogprob);
} else {
float diff = margin - label;
return 0.5f * diff * diff;
}
}
inline float PredToGrad(float label, float pred) const {
return pred - label;
}
inline float PredictMargin(const float *weight,
const SparseMat::Vector &v) const {
// weight[num_feature] is bias
float sum = base_score + weight[num_feature];
for (unsigned i = 0; i < v.length; ++i) {
if (v[i].findex >= num_feature) continue;
sum += weight[v[i].findex] * v[i].fvalue;
}
return sum;
}
inline float Predict(const float *weight,
const SparseMat::Vector &v) const {
return MarginToPred(PredictMargin(weight, v));
}
};
// model parameter
ModelParam param;
// weight corresponding to the model
float *weight;
LinearModel(void) : weight(NULL) {
}
~LinearModel(void) {
if (weight != NULL) delete [] weight;
}
// load model
inline void Load(rabit::IStream &fi) {
fi.Read(&param, sizeof(param));
if (weight == NULL) {
weight = new float[param.num_feature + 1];
}
fi.Read(weight, sizeof(float) * (param.num_feature + 1));
}
inline void Save(rabit::IStream &fo, const float *wptr = NULL) {
fo.Write(&param, sizeof(param));
if (wptr == NULL) wptr = weight;
fo.Write(wptr, sizeof(float) * (param.num_feature + 1));
}
inline float Predict(const SparseMat::Vector &v) const {
return param.Predict(weight, v);
}
};
} // namespace linear
} // namespace rabit
#endif // RABIT_LINEAR_H_

View File

@ -1,20 +0,0 @@
#!/bin/bash
if [ "$#" -lt 3 ];
then
echo "Usage: <nworkers> <path_in_HDFS> [param=val]"
exit -1
fi
# put the local training file to HDFS
hadoop fs -rm -r -f $2/data
hadoop fs -rm -r -f $2/mushroom.linear.model
hadoop fs -mkdir $2/data
hadoop fs -put ../data/agaricus.txt.train $2/data
# submit to hadoop
../../tracker/rabit_hadoop_streaming.py -n $1 --vcores 1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}"
# get the final model file
hadoop fs -get $2/mushroom.linear.model/part-00000 ./linear.model
./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model

View File

@ -1,11 +0,0 @@
#!/bin/bash
if [[ $# -lt 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
rm -rf *.model
k=$1
../../tracker/rabit_demo.py -n $k linear.mock ../data/agaricus.txt.train "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0 mock=0,2,1,1

View File

@ -1,14 +0,0 @@
#!/bin/bash
if [[ $# -lt 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
rm -rf *.model
k=$1
# run linear model, the program will automatically split the inputs
../../tracker/rabit_demo.py -n $k linear.rabit ../data/agaricus.txt.train reg_L1=1
./linear.rabit ../data/agaricus.txt.test task=pred model_in=final.model

View File

@ -1,20 +0,0 @@
#!/bin/bash
if [ "$#" -lt 3 ];
then
echo "Usage: <nworkers> <path_in_HDFS> [param=val]"
exit -1
fi
# put the local training file to HDFS
hadoop fs -rm -r -f $2/mushroom.linear.model
hadoop fs -mkdir $2/data
hadoop fs -put ../data/agaricus.txt.train $2/data
# submit to hadoop
../../tracker/rabit_yarn.py -n $1 --vcores 1 ./linear.rabit hdfs://$2/data/agaricus.txt.train model_out=hdfs://$2/mushroom.linear.model "${*:3}"
# get the final model file
hadoop fs -get $2/mushroom.linear.model ./linear.model
./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model

View File

@ -1,39 +0,0 @@
# this is the common build script for rabit programs
# you do not have to use it
export LDFLAGS= -L../../lib -pthread -lm -lrt
export CFLAGS = -Wall -msse2 -Wno-unknown-pragmas -fPIC -I../../include
# setup HDFS compile and link flags
ifeq ($(USE_HDFS),1)
CFLAGS+= -DRABIT_USE_HDFS=1 -I$(HADOOP_HDFS_HOME)/include -I$(JAVA_HOME)/include
LDFLAGS+= -L$(HADOOP_HDFS_HOME)/lib/native -L$(LIBJVM) -lhdfs -ljvm
else
CFLAGS+= -DRABIT_USE_HDFS=0
endif
.PHONY: clean all lib mpi
all: $(BIN) $(MOCKBIN)
mpi: $(MPIBIN)
lib:
cd ../..;make lib/librabit.a lib/librabit_mock.a; cd -
libmpi:
cd ../..;make lib/librabit_mpi.a;cd -
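# generic link rules: $(filter ...) keeps only source/object prerequisites,
# so library targets can appear as dependencies without being passed to $(CXX)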
$(BIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) -lrabit $(LDFLAGS)
$(MOCKBIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) -lrabit_mock $(LDFLAGS)
$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
$(MPIBIN) :
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) -lrabit_mpi
clean:
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MOCKBIN) *~ ../src/*~

View File

@ -1,21 +0,0 @@
#-----------------------------------------------------
# rabit-learn: the configuration compile script
#
# This is the default configuration setup for rabit-learn
# If you want to change configuration, do the following steps:
#
# - copy this file to the root of rabit-learn folder
# - modify the configuration you want
# - run make or make -j n in each of the folders
#----------------------------------------------------
# choice of compiler
export CC = gcc
export CXX = g++
export MPICXX = mpicxx
# whether to use HDFS support during compilation
USE_HDFS = 1
# path to libjvm.so
LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server

View File

@ -1,669 +0,0 @@
/*!
* Copyright (c) 2015 by Contributors
* \file lbfgs.h
* \brief L-BFGS solver for general optimization problem
*
* \author Tianqi Chen
*/
#ifndef RABIT_LEARN_LBFGS_H_
#define RABIT_LEARN_LBFGS_H_
#include <cmath>
#include <rabit.h>
namespace rabit {
/*! \brief namespace of solver for general problems */
namespace solver {
/*!
* \brief objective function for optimizers
* the objective function can also implement Save/Load
* to persist any state parameters it needs to remember
*/
template<typename DType>
class IObjFunction : public rabit::ISerializable {
public:
// destructor
virtual ~IObjFunction(void){}
/*!
* \brief evaluate function values for a given weight
* \param weight weight of the function
* \param size size of the weight
*/
virtual double Eval(const DType *weight, size_t size) = 0;
/*!
* \return the number of feature dimensions to be allocated
* only called once during initialization
*/
virtual size_t InitNumDim(void) = 0;
/*!
* \brief initialize the weight before starting the solver
* only called once for initialization
*/
virtual void InitModel(DType *weight, size_t size) = 0;
/*!
* \brief calculate gradient for a given weight
* \param out_grad used to store the gradient value of the function
* \param weight weight of the function
* \param size size of the weight
*/
virtual void CalcGrad(DType *out_grad,
const DType *weight,
size_t size) = 0;
};
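To make the contract concrete, a hedged sketch of a toy objective; the class name QuadObj and its one-dimensional quadratic loss are illustrative assumptions, not part of the library. Note that the solver Allreduce-sums Eval and CalcGrad results across workers, so a real objective evaluates only its local data shard.

// toy objective: f(w) = 0.5 * (w - 3)^2, minimized at w = 3
class QuadObj : public rabit::solver::IObjFunction<double> {
 public:
  virtual ~QuadObj(void) {}
  virtual double Eval(const double *weight, size_t size) {
    double d = weight[0] - 3.0;
    return 0.5 * d * d;
  }
  virtual size_t InitNumDim(void) { return 1; }
  virtual void InitModel(double *weight, size_t size) { weight[0] = 0.0; }
  virtual void CalcGrad(double *out_grad, const double *weight, size_t size) {
    out_grad[0] = weight[0] - 3.0;
  }
  // ISerializable part: this toy objective has no extra state to remember
  virtual void Load(rabit::IStream &fi) {}
  virtual void Save(rabit::IStream &fo) const {}
};
// wiring it up (after rabit initialization): QuadObj obj;
// LBFGSSolver<double> solver; solver.SetObjFunction(&obj); solver.Run();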
/*! \brief a basic version of an L-BFGS solver */
template<typename DType>
class LBFGSSolver {
public:
LBFGSSolver(void) {
// set default values
reg_L1 = 0.0f;
max_linesearch_iter = 100;
linesearch_backoff = 0.5f;
linesearch_c1 = 1e-4;
min_lbfgs_iter = 5;
max_lbfgs_iter = 500;
lbfgs_stop_tol = 1e-5f;
silent = 0;
}
virtual ~LBFGSSolver(void) {}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
virtual void SetParam(const char *name, const char *val) {
if (!strcmp("num_dim", name)) {
gstate.num_dim = static_cast<size_t>(atol(val));
}
if (!strcmp("size_memory", name)) {
gstate.size_memory = static_cast<size_t>(atol(val));
}
if (!strcmp("reg_L1", name)) {
reg_L1 = static_cast<float>(atof(val));
}
if (!strcmp("lbfgs_stop_tol", name)) {
lbfgs_stop_tol = static_cast<float>(atof(val));
}
if (!strcmp("linesearch_backoff", name)) {
linesearch_backoff = static_cast<float>(atof(val));
}
if (!strcmp("max_linesearch_iter", name)) {
max_linesearch_iter = atoi(val);
}
if (!strcmp("max_lbfgs_iter", name)) {
max_lbfgs_iter = atoi(val);
}
if (!strcmp("min_lbfgs_iter", name)) {
min_lbfgs_iter = atoi(val);
}
if (!strcmp("linesearch_c1", name)) {
linesearch_c1 = static_cast<float>(atof(val));
}
}
/*!
* \brief set objective function to optimize
* the objective function only need to evaluate and calculate
* gradient with respect to current subset of data
* \param obj the objective function we are looking for
*/
virtual void SetObjFunction(IObjFunction<DType> *obj) {
gstate.obj = obj;
}
/*!
* \brief initialize the LBFGS solver
* user must already set the objective function
*/
virtual void Init(void) {
utils::Check(gstate.obj != NULL,
"LBFGSSolver.Init must SetObjFunction first");
int version = rabit::LoadCheckPoint(&gstate, &hist);
if (version == 0) {
gstate.num_dim = gstate.obj->InitNumDim();
} else {
printf("restart from version=%d\n", version);
}
{
// decide parameter partition
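// each rank owns a contiguous slice [range_begin_, range_end_) of the
// weight vector; history (s, y) vectors are stored only for that slice,
// so per-pair dot products are computed locally and summed via Allreduce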
size_t nproc = rabit::GetWorldSize();
size_t rank = rabit::GetRank();
size_t step = (gstate.num_dim + nproc - 1) / nproc;
// round step up to a multiple of 8 for alignment
step = (step + 7) / 8 * 8;
utils::Assert(step * nproc >= gstate.num_dim, "BUG");
range_begin_ = std::min(rank * step, gstate.num_dim);
range_end_ = std::min((rank + 1) * step, gstate.num_dim);
}
if (version == 0) {
gstate.Init();
hist.Init(range_end_ - range_begin_, gstate.size_memory);
gstate.obj->InitModel(gstate.weight, gstate.num_dim);
// broadcast initialize model
rabit::Broadcast(gstate.weight,
sizeof(DType) * gstate.num_dim, 0);
gstate.old_objval = this->Eval(gstate.weight);
gstate.init_objval = gstate.old_objval;
if (silent == 0 && rabit::GetRank() == 0) {
rabit::TrackerPrintf
("L-BFGS solver starts, num_dim=%lu, init_objval=%g, size_memory=%lu, RAM-approx=%lu\n",
gstate.num_dim, gstate.init_objval, gstate.size_memory,
gstate.MemCost() + hist.MemCost());
}
}
}
/*!
* \brief get the current weight vector
* note that once the update function is called,
* the content of the weight vector is no longer valid
* \return weight vector
*/
virtual DType *GetWeight(void) {
return gstate.weight;
}
/*!
* \brief update the weight for one L-BFGS iteration
* \return whether stopping condition is met
*/
virtual bool UpdateOneIter(void) {
bool stop = false;
GlobalState &g = gstate;
g.obj->CalcGrad(g.grad, g.weight, g.num_dim);
rabit::Allreduce<rabit::op::Sum>(g.grad, g.num_dim);
// find change direction
double vdot = FindChangeDirection(g.tempw, g.grad, g.weight);
// line-search, g.grad is now new weight
int iter = BacktrackLineSearch(g.grad, g.tempw, g.weight, vdot);
utils::Check(iter < max_linesearch_iter, "line search failed");
// swap new weight
std::swap(g.weight, g.grad);
// check stop condition
if (gstate.num_iteration > static_cast<size_t>(min_lbfgs_iter)) {
if (g.old_objval - g.new_objval < lbfgs_stop_tol * g.init_objval) {
return true;
}
}
if (silent == 0 && rabit::GetRank() == 0) {
rabit::TrackerPrintf
("[%d] L-BFGS: linesearch finishes in %d rounds, new_objval=%g, improvment=%g\n",
gstate.num_iteration, iter,
gstate.new_objval,
gstate.old_objval - gstate.new_objval);
}
gstate.old_objval = gstate.new_objval;
rabit::CheckPoint(&gstate, &hist);
return stop;
}
/*! \brief run optimization */
virtual void Run(void) {
this->Init();
while (gstate.num_iteration < static_cast<size_t>(max_lbfgs_iter)) {
if (this->UpdateOneIter()) break;
}
if (silent == 0 && rabit::GetRank() == 0) {
size_t nonzero = 0;
for (size_t i = 0; i < gstate.num_dim; ++i) {
if (gstate.weight[i] != 0.0f) nonzero += 1;
}
rabit::TrackerPrintf
("L-BFGS: finishes at iteration %d, %lu/%lu active weights\n",
gstate.num_iteration, nonzero, gstate.num_dim);
}
}
protected:
// find the delta value, given gradient
// return dot(dir, l1grad)
virtual double FindChangeDirection(DType *dir,
const DType *grad,
const DType *weight) {
int m = static_cast<int>(gstate.size_memory);
int n = static_cast<int>(hist.num_useful());
if (n < m) {
utils::Assert(hist.num_useful() == gstate.num_iteration,
"BUG2, n=%d, it=%lu", n, gstate.num_iteration);
} else {
utils::Assert(n == m, "BUG3");
}
const size_t num_dim = gstate.num_dim;
const DType *gsub = grad + range_begin_;
const size_t nsub = range_end_ - range_begin_;
double vdot = 0.0;
if (n != 0) {
// hist[m + n - 1] stores old gradient
Minus(hist[m + n - 1], gsub, hist[m + n - 1], nsub);
SetL1Dir(hist[2 * m], gsub, weight + range_begin_, nsub);
// index set for calculating results
std::vector<std::pair<size_t, size_t> > idxset;
for (int j = 0; j < n; ++j) {
idxset.push_back(std::make_pair(j, 2 * m));
idxset.push_back(std::make_pair(j, n - 1));
idxset.push_back(std::make_pair(j, m + n - 1));
}
for (int j = 0; j < n; ++j) {
idxset.push_back(std::make_pair(m + j, 2 * m));
idxset.push_back(std::make_pair(m + j, m + n - 1));
}
// calculate dot products
std::vector<double> tmp(idxset.size());
for (size_t i = 0; i < tmp.size(); ++i) {
tmp[i] = hist.CalcDot(idxset[i].first, idxset[i].second);
}
rabit::Allreduce<rabit::op::Sum>(BeginPtr(tmp), tmp.size());
for (size_t i = 0; i < tmp.size(); ++i) {
gstate.DotBuf(idxset[i].first, idxset[i].second) = tmp[i];
}
// BFGS steps, use vector-free update
// parameterize vector using basis in hist
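// vector-free L-BFGS: the search direction is represented as a linear
// combination of the m s-vectors, the m y-vectors, and the steepest
// descent direction (index 2m); delta[] holds the coefficients, so the
// classic two-loop recursion below manipulates only scalars taken from
// the precomputed dot-product table DotBuf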
std::vector<double> alpha(n);
std::vector<double> delta(2 * m + 1, 0.0);
delta[2 * m] = 1.0;
// backward step
for (int j = n - 1; j >= 0; --j) {
double vsum = 0.0;
for (size_t k = 0; k < delta.size(); ++k) {
vsum += delta[k] * gstate.DotBuf(k, j);
}
alpha[j] = vsum / gstate.DotBuf(j, m + j);
delta[m + j] = delta[m + j] - alpha[j];
}
// scale
double scale = gstate.DotBuf(n - 1, m + n - 1) /
gstate.DotBuf(m + n - 1, m + n - 1);
for (size_t k = 0; k < delta.size(); ++k) {
delta[k] *= scale;
}
// forward step
for (int j = 0; j < n; ++j) {
double vsum = 0.0;
for (size_t k = 0; k < delta.size(); ++k) {
vsum += delta[k] * gstate.DotBuf(k, m + j);
}
double beta = vsum / gstate.DotBuf(j, m + j);
delta[j] = delta[j] + (alpha[j] - beta);
}
// set all to zero
std::fill(dir, dir + num_dim, 0.0f);
DType *dirsub = dir + range_begin_;
for (int i = 0; i < n; ++i) {
AddScale(dirsub, dirsub, hist[m + i], delta[m + i], nsub);
}
AddScale(dirsub, dirsub, hist[2 * m], delta[2 * m], nsub);
for (int i = 0; i < n; ++i) {
AddScale(dirsub, dirsub, hist[i], delta[i], nsub);
}
FixDirL1Sign(dirsub, hist[2 * m], nsub);
vdot = -Dot(dirsub, hist[2 * m], nsub);
// allreduce to get full direction
rabit::Allreduce<rabit::op::Sum>(dir, num_dim);
rabit::Allreduce<rabit::op::Sum>(&vdot, 1);
} else {
SetL1Dir(dir, grad, weight, num_dim);
vdot = -Dot(dir, dir, num_dim);
}
// shift the history record
if (n < m) {
n += 1;
} else {
gstate.Shift(); hist.Shift();
}
hist.set_num_useful(n);
// copy gradient to hist[m + n - 1]
memcpy(hist[m + n - 1], gsub, nsub * sizeof(DType));
return vdot;
}
// backtracking line search along the given direction;
// returns the number of evaluations used (== max_linesearch_iter on failure)
inline int BacktrackLineSearch(DType *new_weight,
const DType *dir,
const DType *weight,
double dot_dir_l1grad) {
utils::Assert(dot_dir_l1grad < 0.0f,
"gradient error, dotv=%g", dot_dir_l1grad);
double alpha = 1.0;
double backoff = linesearch_backoff;
// unit descent direction in first iter
if (gstate.num_iteration == 0) {
utils::Assert(hist.num_useful() == 1, "hist.nuseful");
alpha = 1.0f / std::sqrt(-dot_dir_l1grad);
backoff = 0.1f;
}
int iter = 0;
double old_val = gstate.old_objval;
double c1 = this->linesearch_c1;
while (true) {
const size_t num_dim = gstate.num_dim;
if (++iter >= max_linesearch_iter) return iter;
AddScale(new_weight, weight, dir, alpha, num_dim);
this->FixWeightL1Sign(new_weight, weight, num_dim);
double new_val = this->Eval(new_weight);
if (new_val - old_val <= c1 * dot_dir_l1grad * alpha) {
gstate.new_objval = new_val; break;
}
alpha *= backoff;
}
// hist[n - 1] = new_weight - weight
Minus(hist[hist.num_useful() - 1],
new_weight + range_begin_,
weight + range_begin_,
range_end_ - range_begin_);
gstate.num_iteration += 1;
return iter;
}
// OWL-QN step for L1 regularization
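// computes the steepest-descent direction of f(w) + reg_L1 * |w|_1:
// away from zero the L1 term is differentiable; at w == 0 the
// minimum-norm subgradient is used, and the component is set to 0
// whenever -reg_L1 <= grad <= reg_L1 (no descent is possible there)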
inline void SetL1Dir(DType *dst,
const DType *grad,
const DType *weight,
size_t size) {
if (reg_L1 == 0.0) {
for (size_t i = 0; i < size; ++i) {
dst[i] = -grad[i];
}
} else {
for (size_t i = 0; i < size; ++i) {
if (weight[i] > 0.0f) {
dst[i] = -grad[i] - reg_L1;
} else if (weight[i] < 0.0f) {
dst[i] = -grad[i] + reg_L1;
} else {
if (grad[i] < -reg_L1) {
dst[i] = -grad[i] - reg_L1;
} else if (grad[i] > reg_L1) {
dst[i] = -grad[i] + reg_L1;
} else {
dst[i] = 0.0;
}
}
}
}
}
// OWL-QN step: zero out direction entries whose sign disagrees with the steepest-descent proposal
inline void FixDirL1Sign(DType *dir,
const DType *steepdir,
size_t size) {
if (reg_L1 != 0.0f) {
for (size_t i = 0; i < size; ++i) {
if (dir[i] * steepdir[i] <= 0.0f) {
dir[i] = 0.0f;
}
}
}
}
// OWL-QN step: project the new weight back to the orthant of the old weight
inline void FixWeightL1Sign(DType *new_weight,
const DType *weight,
size_t size) {
if (reg_L1 != 0.0f) {
for (size_t i = 0; i < size; ++i) {
if (new_weight[i] * weight[i] < 0.0f) {
new_weight[i] = 0.0f;
}
}
}
}
inline double Eval(const DType *weight) {
double val = gstate.obj->Eval(weight, gstate.num_dim);
rabit::Allreduce<rabit::op::Sum>(&val, 1);
if (reg_L1 != 0.0f) {
double l1norm = 0.0;
for (size_t i = 0; i < gstate.num_dim; ++i) {
l1norm += std::abs(weight[i]);
}
val += l1norm * reg_L1;
}
return val;
}
private:
// helper functions
// dst = lhs + rhs * scale
inline static void AddScale(DType *dst,
const DType *lhs,
const DType *rhs,
DType scale,
size_t size) {
for (size_t i = 0; i < size; ++i) {
dst[i] = lhs[i] + rhs[i] * scale;
}
}
// dst = lhs - rhs
inline static void Minus(DType *dst,
const DType *lhs,
const DType *rhs,
size_t size) {
for (size_t i = 0; i < size; ++i) {
dst[i] = lhs[i] - rhs[i];
}
}
// return dot(lhs, rhs)
inline static double Dot(const DType *lhs,
const DType *rhs,
size_t size) {
double res = 0.0;
for (size_t i = 0; i < size; ++i) {
res += lhs[i] * rhs[i];
}
return res;
}
// map rolling array index
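// rolling-history layout: slots [0, m) hold the s-vectors (weight deltas),
// slots [m, 2m) hold the y-vectors (gradient deltas), and slot 2m holds the
// steepest-descent direction; offset makes the first two groups circular
// buffers so Shift() costs O(1) instead of moving any data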
inline static size_t MapIndex(size_t i, size_t offset,
size_t size_memory) {
if (i == 2 * size_memory) return i;
if (i < size_memory) {
return (i + offset) % size_memory;
} else {
utils::Assert(i < 2 * size_memory,
"MapIndex: index exceed bound, i=%lu", i);
return (i + offset) % size_memory + size_memory;
}
}
// global solver state
struct GlobalState : public rabit::ISerializable {
public:
// memory size of L-BFGS
size_t size_memory;
// number of iterations passed
size_t num_iteration;
// number of features in the solver
size_t num_dim;
// initialize objective value
double init_objval;
// history objective value
double old_objval;
// new objective value
double new_objval;
// objective function
IObjFunction<DType> *obj;
// temporal storage
DType *grad, *weight, *tempw;
// constructor
GlobalState(void)
: obj(NULL), grad(NULL),
weight(NULL), tempw(NULL) {
size_memory = 10;
num_iteration = 0;
num_dim = 0;
old_objval = 0.0;
offset_ = 0;
}
~GlobalState(void) {
if (grad != NULL) {
delete [] grad;
delete [] weight;
delete [] tempw;
}
}
// initialize the space of the rolling array
inline void Init(void) {
size_t n = size_memory * 2 + 1;
data.resize(n * n, 0.0);
this->AllocSpace();
}
// memory cost
inline size_t MemCost(void) const {
return sizeof(DType) * 3 * num_dim;
}
inline double &DotBuf(size_t i, size_t j) {
if (i > j) std::swap(i, j);
return data[MapIndex(i, offset_, size_memory) * (size_memory * 2 + 1) +
MapIndex(j, offset_, size_memory)];
}
// load the shift array
virtual void Load(rabit::IStream &fi) {
fi.Read(&size_memory, sizeof(size_memory));
fi.Read(&num_iteration, sizeof(num_iteration));
fi.Read(&num_dim, sizeof(num_dim));
fi.Read(&init_objval, sizeof(init_objval));
fi.Read(&old_objval, sizeof(old_objval));
fi.Read(&offset_, sizeof(offset_));
fi.Read(&data);
this->AllocSpace();
fi.Read(weight, sizeof(DType) * num_dim);
obj->Load(fi);
}
// save the shift array
virtual void Save(rabit::IStream &fo) const {
fo.Write(&size_memory, sizeof(size_memory));
fo.Write(&num_iteration, sizeof(num_iteration));
fo.Write(&num_dim, sizeof(num_dim));
fo.Write(&init_objval, sizeof(init_objval));
fo.Write(&old_objval, sizeof(old_objval));
fo.Write(&offset_, sizeof(offset_));
fo.Write(data);
fo.Write(weight, sizeof(DType) * num_dim);
obj->Save(fo);
}
inline void Shift(void) {
offset_ = (offset_ + 1) % size_memory;
}
private:
// rolling offset in the current memory
size_t offset_;
std::vector<double> data;
// allocate space
inline void AllocSpace(void) {
if (grad == NULL) {
grad = new DType[num_dim];
weight = new DType[num_dim];
tempw = new DType[num_dim];
}
}
};
/*! \brief rolling array that carries history information */
struct HistoryArray : public rabit::ISerializable {
public:
HistoryArray(void) : dptr_(NULL) {
num_useful_ = 0;
}
~HistoryArray(void) {
if (dptr_ != NULL) delete [] dptr_;
}
// initialize the space of the rolling array
inline void Init(size_t num_col, size_t size_memory) {
if (dptr_ != NULL &&
(num_col_ != num_col || size_memory_ != size_memory)) {
delete [] dptr_;
dptr_ = NULL;
}
num_col_ = num_col;
size_memory_ = size_memory;
stride_ = num_col_;
offset_ = 0;
if (dptr_ == NULL) {
size_t n = size_memory * 2 + 1;
dptr_ = new DType[n * stride_];
}
}
// memory cost
inline size_t MemCost(void) const {
return sizeof(DType) * (size_memory_ * 2 + 1) * stride_;
}
// fetch element from rolling array
inline const DType *operator[](size_t i) const {
return dptr_ + MapIndex(i, offset_, size_memory_) * stride_;
}
inline DType *operator[](size_t i) {
return dptr_ + MapIndex(i, offset_, size_memory_) * stride_;
}
// shift array: arr_old -> arr_new
// for i in [0, size_memory - 1), arr_new[i] = arr_old[i + 1]
// for i in [size_memory, 2 * size_memory - 1), arr_new[i] = arr_old[i + 1]
// arr_old[0] and arr_old[size_memory] will be discarded
inline void Shift(void) {
offset_ = (offset_ + 1) % size_memory_;
}
inline double CalcDot(size_t i, size_t j) const {
return Dot((*this)[i], (*this)[j], num_col_);
}
// get number of useful memory
inline const size_t &num_useful(void) const {
return num_useful_;
}
// set number of useful memory
inline void set_num_useful(size_t num_useful) {
utils::Assert(num_useful <= size_memory_,
"num_useful exceed bound");
num_useful_ = num_useful;
}
// load the shift array
virtual void Load(rabit::IStream &fi) {
fi.Read(&num_col_, sizeof(num_col_));
fi.Read(&stride_, sizeof(stride_));
fi.Read(&size_memory_, sizeof(size_memory_));
fi.Read(&num_useful_, sizeof(num_useful_));
this->Init(num_col_, size_memory_);
for (size_t i = 0; i < num_useful_; ++i) {
fi.Read((*this)[i], num_col_ * sizeof(DType));
fi.Read((*this)[i + size_memory_], num_col_ * sizeof(DType));
}
}
// save the shift array
virtual void Save(rabit::IStream &fi) const {
fi.Write(&num_col_, sizeof(num_col_));
fi.Write(&stride_, sizeof(stride_));
fi.Write(&size_memory_, sizeof(size_memory_));
fi.Write(&num_useful_, sizeof(num_useful_));
for (size_t i = 0; i < num_useful_; ++i) {
fi.Write((*this)[i], num_col_ * sizeof(DType));
fi.Write((*this)[i + size_memory_], num_col_ * sizeof(DType));
}
}
private:
// number of columns in each of array
size_t num_col_;
// stride for each of column for alignment
size_t stride_;
// memory size of L-BFGS
size_t size_memory_;
// number of useful memory that will be used
size_t num_useful_;
// rolling offset in the current memory
size_t offset_;
// data pointer
DType *dptr_;
};
// data structure for LBFGS
GlobalState gstate;
HistoryArray hist;
// silent
int silent;
// the subrange of current node
size_t range_begin_;
size_t range_end_;
// L1 regularization co-efficient
float reg_L1;
// c1 ratio for line search
float linesearch_c1;
float linesearch_backoff;
int max_linesearch_iter;
int max_lbfgs_iter;
int min_lbfgs_iter;
float lbfgs_stop_tol;
};
} // namespace solver
} // namespace rabit
#endif // RABIT_LEARN_LBFGS_H_

View File

@ -1,138 +0,0 @@
/*!
* Copyright (c) 2015 by Contributors
* \file data.h
* \brief simple data structure that could be used by model
*
* \author Tianqi Chen
*/
#ifndef RABIT_LEARN_DATA_H_
#define RABIT_LEARN_DATA_H_
#include <vector>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <limits>
#include <cmath>
#include <sstream>
#include <rabit.h>
#include "../io/io.h"
namespace rabit {
// typedef index type
typedef unsigned index_t;
/*! \brief sparse matrix, CSR format */
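// CSR layout: row_ptr has NumRow() + 1 entries and the nonzeros of row i
// live in data[row_ptr[i] .. row_ptr[i + 1]) as (findex, fvalue) pairs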
struct SparseMat {
// sparse matrix entry
struct Entry {
// feature index
index_t findex;
// feature value
float fvalue;
};
// sparse vector
struct Vector {
const Entry *data;
index_t length;
inline const Entry &operator[](size_t i) const {
return data[i];
}
};
inline Vector operator[](size_t i) const {
Vector v;
v.data = &data[0] + row_ptr[i];
v.length = static_cast<index_t>(row_ptr[i + 1]-row_ptr[i]);
return v;
}
// load data from LibSVM format
inline void Load(const char *fname) {
io::InputSplit *in =
io::CreateInputSplit
(fname, rabit::GetRank(),
rabit::GetWorldSize());
row_ptr.clear();
row_ptr.push_back(0);
data.clear();
feat_dim = 0;
std::string line;
while (in->NextLine(&line)) {
float label;
std::istringstream ss(line);
ss >> label;
Entry e;
unsigned long fidx;
while (!ss.eof()) {
if (!(ss >> fidx)) break;
ss.ignore(32, ':');
if (!(ss >> e.fvalue)) break;
e.findex = static_cast<index_t>(fidx);
data.push_back(e);
feat_dim = std::max(fidx, feat_dim);
}
labels.push_back(label);
row_ptr.push_back(data.size());
}
delete in;
feat_dim += 1;
utils::Check(feat_dim < std::numeric_limits<index_t>::max(),
"feature dimension exceed limit of index_t"\
"consider change the index_t to unsigned long");
}
inline size_t NumRow(void) const {
return row_ptr.size() - 1;
}
// memory cost
inline size_t MemCost(void) const {
return data.size() * sizeof(Entry);
}
// maximum feature dimension
size_t feat_dim;
std::vector<size_t> row_ptr;
std::vector<Entry> data;
std::vector<float> labels;
};
// dense matrix
struct Matrix {
inline void Init(size_t nrow, size_t ncol, float v = 0.0f) {
this->nrow = nrow;
this->ncol = ncol;
data.resize(nrow * ncol);
std::fill(data.begin(), data.end(), v);
}
inline float *operator[](size_t i) {
return &data[0] + i * ncol;
}
inline const float *operator[](size_t i) const {
return &data[0] + i * ncol;
}
inline void Print(const char *fname) {
FILE *fo;
if (!strcmp(fname, "stdout")) {
fo = stdout;
} else {
fo = utils::FopenCheck(fname, "w");
}
for (size_t i = 0; i < data.size(); ++i) {
fprintf(fo, "%g", data[i]);
if ((i+1) % ncol == 0) {
fprintf(fo, "\n");
} else {
fprintf(fo, " ");
}
}
// close the file
if (fo != stdout) fclose(fo);
}
// number of data
size_t nrow, ncol;
std::vector<float> data;
};
/*! \brief returns a random integer in [0, value); note that rand() % value has a slight modulo bias */
inline int Random(int value) {
return rand() % value;
}
} // namespace rabit
#endif // RABIT_LEARN_DATA_H_
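As a concrete illustration of the CSR accessors above (the file name is just the demo data set used elsewhere in the examples):

// hypothetical row-iteration sketch, not part of the header
rabit::SparseMat mat;
mat.Load("../data/agaricus.txt.train");  // each rank reads its input split
for (size_t i = 0; i < mat.NumRow(); ++i) {
  rabit::SparseMat::Vector row = mat[i];
  float sum = 0.0f;
  for (rabit::index_t j = 0; j < row.length; ++j) {
    sum += row[j].fvalue;  // e.g. accumulate this row's feature values
  }
}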

View File

@ -31,6 +31,7 @@ AllreduceBase::AllreduceBase(void) {
// tracker URL
task_id = "NULL";
err_link = NULL;
dmlc_role = "worker";
this->SetParam("rabit_reduce_buffer", "256MB");
// setup possible environment variables of interest
env_vars.push_back("rabit_task_id");
@ -39,6 +40,12 @@ AllreduceBase::AllreduceBase(void) {
env_vars.push_back("rabit_reduce_ring_mincount");
env_vars.push_back("rabit_tracker_uri");
env_vars.push_back("rabit_tracker_port");
// also include dmlc support direct variables
env_vars.push_back("DMLC_TASK_ID");
env_vars.push_back("DMLC_ROLE");
env_vars.push_back("DMLC_NUM_ATTEMPT");
env_vars.push_back("DMLC_TRACKER_URI");
env_vars.push_back("DMLC_TRACKER_PORT");
}
// initialization function
@ -86,6 +93,10 @@ void AllreduceBase::Init(void) {
this->SetParam("rabit_world_size", num_task);
}
}
if (dmlc_role != "worker") {
fprintf(stderr, "Rabit module currently only works with dmlc workers; quitting this program with exit 0\n");
exit(0);
}
// clear the setting before start reconnection
this->rank = -1;
//---------------------
@ -150,6 +161,10 @@ void AllreduceBase::SetParam(const char *name, const char *val) {
if (!strcmp(name, "rabit_tracker_uri")) tracker_uri = val;
if (!strcmp(name, "rabit_tracker_port")) tracker_port = atoi(val);
if (!strcmp(name, "rabit_task_id")) task_id = val;
if (!strcmp(name, "DMLC_TRACKER_URI")) tracker_uri = val;
if (!strcmp(name, "DMLC_TRACKER_PORT")) tracker_port = atoi(val);
if (!strcmp(name, "DMLC_TASK_ID")) task_id = val;
if (!strcmp(name, "DMLC_ROLE")) dmlc_role = val;
if (!strcmp(name, "rabit_world_size")) world_size = atoi(val);
if (!strcmp(name, "rabit_hadoop_mode")) hadoop_mode = atoi(val);
if (!strcmp(name, "rabit_reduce_ring_mincount")) {

View File

@ -126,8 +126,8 @@ class AllreduceBase : public IEngine {
*
* \sa CheckPoint, VersionNumber
*/
virtual int LoadCheckPoint(ISerializable *global_model,
ISerializable *local_model = NULL) {
virtual int LoadCheckPoint(Serializable *global_model,
Serializable *local_model = NULL) {
return 0;
}
/*!
@ -146,8 +146,8 @@ class AllreduceBase : public IEngine {
*
* \sa LoadCheckPoint, VersionNumber
*/
virtual void CheckPoint(const ISerializable *global_model,
const ISerializable *local_model = NULL) {
virtual void CheckPoint(const Serializable *global_model,
const Serializable *local_model = NULL) {
version_number += 1;
}
/*!
@ -170,7 +170,7 @@ class AllreduceBase : public IEngine {
* is the same in all nodes
* \sa LoadCheckPoint, CheckPoint, VersionNumber
*/
virtual void LazyCheckPoint(const ISerializable *global_model) {
virtual void LazyCheckPoint(const Serializable *global_model) {
version_number += 1;
}
/*!
@ -496,6 +496,8 @@ class AllreduceBase : public IEngine {
std::string host_uri;
// uri of tracker
std::string tracker_uri;
// role in dmlc jobs
std::string dmlc_role;
// port of tracker address
int tracker_port;
// port of slave process

View File

@ -5,8 +5,8 @@
*
* \author Ignacio Cano, Tianqi Chen
*/
#ifndef RABIT_ALLREDUCE_MOCK_H
#define RABIT_ALLREDUCE_MOCK_H
#ifndef RABIT_ALLREDUCE_MOCK_H_
#define RABIT_ALLREDUCE_MOCK_H_
#include <vector>
#include <map>
#include <sstream>
@ -31,6 +31,7 @@ class AllreduceMock : public AllreduceRobust {
AllreduceRobust::SetParam(name, val);
// additional parameters
if (!strcmp(name, "rabit_num_trial")) num_trial = atoi(val);
if (!strcmp(name, "DMLC_NUM_ATTEMPT")) num_trial = atoi(val);
if (!strcmp(name, "report_stats")) report_stats = atoi(val);
if (!strcmp(name, "force_local")) force_local = atoi(val);
if (!strcmp(name, "mock")) {
@ -57,8 +58,8 @@ class AllreduceMock : public AllreduceRobust {
this->Verify(MockKey(rank, version_number, seq_counter, num_trial), "Broadcast");
AllreduceRobust::Broadcast(sendrecvbuf_, total_size, root);
}
virtual int LoadCheckPoint(ISerializable *global_model,
ISerializable *local_model) {
virtual int LoadCheckPoint(Serializable *global_model,
Serializable *local_model) {
tsum_allreduce = 0.0;
time_checkpoint = utils::GetTime();
if (force_local == 0) {
@ -69,8 +70,8 @@ class AllreduceMock : public AllreduceRobust {
return AllreduceRobust::LoadCheckPoint(&dum, &com);
}
}
virtual void CheckPoint(const ISerializable *global_model,
const ISerializable *local_model) {
virtual void CheckPoint(const Serializable *global_model,
const Serializable *local_model) {
this->Verify(MockKey(rank, version_number, seq_counter, num_trial), "CheckPoint");
double tstart = utils::GetTime();
double tbet_chkpt = tstart - time_checkpoint;
@ -95,7 +96,7 @@ class AllreduceMock : public AllreduceRobust {
tsum_allreduce = 0.0;
}
virtual void LazyCheckPoint(const ISerializable *global_model) {
virtual void LazyCheckPoint(const Serializable *global_model) {
this->Verify(MockKey(rank, version_number, seq_counter, num_trial), "LazyCheckPoint");
AllreduceRobust::LazyCheckPoint(global_model);
}
@ -109,28 +110,28 @@ class AllreduceMock : public AllreduceRobust {
double time_checkpoint;
private:
struct DummySerializer : public ISerializable {
virtual void Load(IStream &fi) {
struct DummySerializer : public Serializable {
virtual void Load(Stream *fi) {
}
virtual void Save(IStream &fo) const {
virtual void Save(Stream *fo) const {
}
};
struct ComboSerializer : public ISerializable {
ISerializable *lhs;
ISerializable *rhs;
const ISerializable *c_lhs;
const ISerializable *c_rhs;
ComboSerializer(ISerializable *lhs, ISerializable *rhs)
struct ComboSerializer : public Serializable {
Serializable *lhs;
Serializable *rhs;
const Serializable *c_lhs;
const Serializable *c_rhs;
ComboSerializer(Serializable *lhs, Serializable *rhs)
: lhs(lhs), rhs(rhs), c_lhs(lhs), c_rhs(rhs) {
}
ComboSerializer(const ISerializable *lhs, const ISerializable *rhs)
ComboSerializer(const Serializable *lhs, const Serializable *rhs)
: lhs(NULL), rhs(NULL), c_lhs(lhs), c_rhs(rhs) {
}
virtual void Load(IStream &fi) {
virtual void Load(Stream *fi) {
if (lhs != NULL) lhs->Load(fi);
if (rhs != NULL) rhs->Load(fi);
}
virtual void Save(IStream &fo) const {
virtual void Save(Stream *fo) const {
if (c_lhs != NULL) c_lhs->Save(fo);
if (c_rhs != NULL) c_rhs->Save(fo);
}
@ -172,4 +173,4 @@ class AllreduceMock : public AllreduceRobust {
};
} // namespace engine
} // namespace rabit
#endif // RABIT_ALLREDUCE_MOCK_H
#endif // RABIT_ALLREDUCE_MOCK_H_

View File

@ -158,8 +158,8 @@ void AllreduceRobust::Broadcast(void *sendrecvbuf_, size_t total_size, int root)
*
* \sa CheckPoint, VersionNumber
*/
int AllreduceRobust::LoadCheckPoint(ISerializable *global_model,
ISerializable *local_model) {
int AllreduceRobust::LoadCheckPoint(Serializable *global_model,
Serializable *local_model) {
// skip action in single node
if (world_size == 1) return 0;
this->LocalModelCheck(local_model != NULL);
@ -175,7 +175,7 @@ int AllreduceRobust::LoadCheckPoint(ISerializable *global_model,
// load in local model
utils::MemoryFixSizeBuffer fs(BeginPtr(local_chkpt[local_chkpt_version]),
local_rptr[local_chkpt_version][1]);
local_model->Load(fs);
local_model->Load(&fs);
} else {
utils::Assert(nlocal == 0, "[%d] local model inconsistent, nlocal=%d", rank, nlocal);
}
@ -189,7 +189,7 @@ int AllreduceRobust::LoadCheckPoint(ISerializable *global_model,
} else {
utils::Assert(fs.Read(&version_number, sizeof(version_number)) != 0,
"read in version number");
global_model->Load(fs);
global_model->Load(&fs);
utils::Assert(local_model == NULL || nlocal == num_local_replica + 1,
"local model inconsistent, nlocal=%d", nlocal);
}
@ -241,8 +241,8 @@ void AllreduceRobust::LocalModelCheck(bool with_local) {
*
* \sa CheckPoint, LazyCheckPoint
*/
void AllreduceRobust::CheckPoint_(const ISerializable *global_model,
const ISerializable *local_model,
void AllreduceRobust::CheckPoint_(const Serializable *global_model,
const Serializable *local_model,
bool lazy_checkpt) {
// never do check point in single machine mode
if (world_size == 1) {
@ -261,7 +261,7 @@ void AllreduceRobust::CheckPoint_(const ISerializable *global_model,
local_chkpt[new_version].clear();
utils::MemoryBufferStream fs(&local_chkpt[new_version]);
if (local_model != NULL) {
local_model->Save(fs);
local_model->Save(&fs);
}
local_rptr[new_version].clear();
local_rptr[new_version].push_back(0);
@ -287,7 +287,7 @@ void AllreduceRobust::CheckPoint_(const ISerializable *global_model,
global_checkpoint.resize(0);
utils::MemoryBufferStream fs(&global_checkpoint);
fs.Write(&version_number, sizeof(version_number));
global_model->Save(fs);
global_model->Save(&fs);
global_lazycheck = NULL;
}
// reset result buffer
@ -748,7 +748,7 @@ AllreduceRobust::ReturnType AllreduceRobust::TryLoadCheckPoint(bool requester) {
global_checkpoint.resize(0);
utils::MemoryBufferStream fs(&global_checkpoint);
fs.Write(&version_number, sizeof(version_number));
global_lazycheck->Save(fs);
global_lazycheck->Save(&fs);
global_lazycheck = NULL;
}
// recover global checkpoint

View File

@ -80,8 +80,8 @@ class AllreduceRobust : public AllreduceBase {
*
* \sa CheckPoint, VersionNumber
*/
virtual int LoadCheckPoint(ISerializable *global_model,
ISerializable *local_model = NULL);
virtual int LoadCheckPoint(Serializable *global_model,
Serializable *local_model = NULL);
/*!
* \brief checkpoint the model, meaning we finished a stage of execution
* every time we call check point, there is a version number which will increase by one
@ -98,8 +98,8 @@ class AllreduceRobust : public AllreduceBase {
*
* \sa LoadCheckPoint, VersionNumber
*/
virtual void CheckPoint(const ISerializable *global_model,
const ISerializable *local_model = NULL) {
virtual void CheckPoint(const Serializable *global_model,
const Serializable *local_model = NULL) {
this->CheckPoint_(global_model, local_model, false);
}
/*!
@ -122,7 +122,7 @@ class AllreduceRobust : public AllreduceBase {
* is the same in all nodes
* \sa LoadCheckPoint, CheckPoint, VersionNumber
*/
virtual void LazyCheckPoint(const ISerializable *global_model) {
virtual void LazyCheckPoint(const Serializable *global_model) {
this->CheckPoint_(global_model, NULL, true);
}
/*!
@ -318,8 +318,8 @@ class AllreduceRobust : public AllreduceBase {
*
* \sa CheckPoint, LazyCheckPoint
*/
void CheckPoint_(const ISerializable *global_model,
const ISerializable *local_model,
void CheckPoint_(const Serializable *global_model,
const Serializable *local_model,
bool lazy_checkpt);
/*!
* \brief reset the all the existing links by sending Out-of-Band message marker
@ -521,7 +521,7 @@ * the input state must contain exactly one saved state (local state of current node)
// last check point global model
std::string global_checkpoint;
// lazy checkpoint of global model
const ISerializable *global_lazycheck;
const Serializable *global_lazycheck;
// number of replica for local state/model
int num_local_replica;
// number of default local replica

View File

@ -34,15 +34,15 @@ class EmptyEngine : public IEngine {
virtual void InitAfterException(void) {
utils::Error("EmptyEngine is not fault tolerant");
}
virtual int LoadCheckPoint(ISerializable *global_model,
ISerializable *local_model = NULL) {
virtual int LoadCheckPoint(Serializable *global_model,
Serializable *local_model = NULL) {
return 0;
}
virtual void CheckPoint(const ISerializable *global_model,
const ISerializable *local_model = NULL) {
virtual void CheckPoint(const Serializable *global_model,
const Serializable *local_model = NULL) {
version_number += 1;
}
virtual void LazyCheckPoint(const ISerializable *global_model) {
virtual void LazyCheckPoint(const Serializable *global_model) {
version_number += 1;
}
virtual int VersionNumber(void) const {

View File

@ -37,15 +37,15 @@ class MPIEngine : public IEngine {
virtual void InitAfterException(void) {
utils::Error("MPI is not fault tolerant");
}
virtual int LoadCheckPoint(ISerializable *global_model,
ISerializable *local_model = NULL) {
virtual int LoadCheckPoint(Serializable *global_model,
Serializable *local_model = NULL) {
return 0;
}
virtual void CheckPoint(const ISerializable *global_model,
const ISerializable *local_model = NULL) {
virtual void CheckPoint(const Serializable *global_model,
const Serializable *local_model = NULL) {
version_number += 1;
}
virtual void LazyCheckPoint(const ISerializable *global_model) {
virtual void LazyCheckPoint(const Serializable *global_model) {
version_number += 1;
}
virtual int VersionNumber(void) const {

Some files were not shown because too many files have changed in this diff