Squashed 'subtree/rabit/' changes from fa99857..e81a11d

e81a11d Merge pull request #25 from daiyl0320/master
35c3b37 add retry mechanism to ConnectTracker and modify Listen backlog to 128 in rabit_traker.py
c71ed6f try deply doxygen
62e5647 try deply doxygen
732f1c6 try
2fa6e02 ok
0537665 minor
7b59dcb minor
5934950 new doc
f538187 ok
44b6049 new doc
387339b add more
9d4397a chg
2879a48 chg
30e3110 ok
9ff0301 add link translation
6b629c2 k
32e1955 ok
8f4839d fix
93137b2 ok
7eeeb79 reload recommonmark
a8f00cc minor
19b0f01 ok
dd01184 minor
c1cdc19 minor
fcf0f43 try rst
cbc21ae try
62ddfa7 tiny
aefc05c final change
2aee9b4 minor
fe4e7c2 ok
8001983 change to subtitle
5ca33e4 ok
88f7d24 update guide
29d43ab add code
fe8bb3b minor hack for readthedocs
229c71d Merge branch 'master' of ssh://github.com/dmlc/rabit
7424218 ok
d1d45bb Update README.md
1e8813f Update README.md
1ccc990 Update README.md
0323e06 remove readme
679a835 remove theme
7ea5b7c remove numpydoc to napoleon
b73e2be Merge branch 'master' of ssh://github.com/dmlc/rabit
1742283 ok
1838e25 Update python-requirements.txt
bc4e957 ok
fba6fc2 ok
0251101 ok
d50b905 ok
d4f2509 ok
cdf401a ok
fef0ef2 new doc
cef360d ok
c125d2a ok
270a49e add requirments
744f901 get the basic doc
1cb5cad Merge branch 'master' of ssh://github.com/dmlc/rabit
8cc07ba minor
d74f126 Update .travis.yml
52b3dcd Update .travis.yml
099581b Update .travis.yml
1258046 Update .travis.yml
7addac9 Update Makefile
0ea7adf Update .travis.yml
f858856 Update travis_script.sh
d8eac4a Update README.md
3cc49ad lint and travis
ceedf4e fix
fd8920c fix win32
8bbed35 modify
9520b90 Merge pull request #14 from dmlc/hjk41
df14bb1 fix type
f441dc7 replace tab with blankspace
2467942 remove unnecessary include
181ef47 defined long long and ulonglong
1582180 use int32_t to define int and int64_t to define long. in VC long is 32bit
e0b7da0 fix

git-subtree-dir: subtree/rabit
git-subtree-split: e81a11dd7ee3cff87a38a42901315821df018bae
This commit is contained in:
tqchen 2015-10-20 19:37:47 -07:00
parent 7258f3353c
commit a16289b204
44 changed files with 1195 additions and 532 deletions

3
.gitignore vendored
View File

@ -34,3 +34,6 @@
*tmp*
*.rabit
*.mock
dmlc-core
recommonmark
recom

51
.travis.yml Normal file
View File

@ -0,0 +1,51 @@
# disable sudo to use container based build
sudo: false
# Use Build Matrix to do lint and build seperately
env:
matrix:
- TASK=lint LINT_LANG=cpp
- TASK=lint LINT_LANG=python
- TASK=doc
- TASK=build CXX=g++
- TASK=test CXX=g++
# dependent apt packages
addons:
apt:
packages:
- doxygen
- libopenmpi-dev
- wget
- git
- libcurl4-openssl-dev
- unzip
- python-numpy
before_install:
- git clone https://github.com/dmlc/dmlc-core
- export TRAVIS=dmlc-core/scripts/travis/
- source ${TRAVIS}/travis_setup_env.sh
install:
- pip install cpplint pylint --user `whoami`
script: scripts/travis_script.sh
before_cache:
- ${TRAVIS}/travis_before_cache.sh
cache:
directories:
- ${HOME}/.cache/usr
notifications:
# Emails are sent to the committer's git-configured email address by default,
email:
on_success: change
on_failure: always

View File

@ -3,8 +3,19 @@ export CXX = g++
endif
export MPICXX = mpicxx
export LDFLAGS= -Llib -lrt
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -pedantic
export CFLAGS = -O3 -msse2 -fPIC $(WARNFLAGS)
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++0x
export CFLAGS = -O3 -msse2 $(WARNFLAGS)
ifndef WITH_FPIC
WITH_FPIC = 1
endif
ifeq ($(WITH_FPIC), 1)
CFLAGS += -fPIC
endif
ifndef LINT_LANG
LINT_LANG="all"
endif
# build path
BPATH=.
@ -15,7 +26,9 @@ OBJ= $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o $(B
SLIB= wrapper/librabit_wrapper.so wrapper/librabit_wrapper_mock.so wrapper/librabit_wrapper_mpi.so
ALIB= lib/librabit.a lib/librabit_mpi.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a
HEADERS=src/*.h include/*.h include/rabit/*.h
.PHONY: clean all install mpi python
DMLC=dmlc-core
.PHONY: clean all install mpi python lint doc doxygen
all: lib/librabit.a lib/librabit_mock.a wrapper/librabit_wrapper.so wrapper/librabit_wrapper_mock.so lib/librabit_base.a
mpi: lib/librabit_mpi.a wrapper/librabit_wrapper_mpi.so
@ -52,6 +65,12 @@ $(ALIB):
$(SLIB) :
$(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
clean:
$(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) *~ src/*~ include/*~ include/*/*~ wrapper/*~
lint:
$(DMLC)/scripts/lint.py rabit $(LINT_LANG) src include wrapper
doc doxygen:
cd include; doxygen ../doc/Doxyfile; cd -
clean:
$(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~ wrapper/*~

View File

@ -1,4 +1,6 @@
## rabit: Reliable Allreduce and Broadcast Interface
[![Build Status](https://travis-ci.org/dmlc/rabit.svg?branch=master)](https://travis-ci.org/dmlc/rabit)
[![Documentation Status](https://readthedocs.org/projects/rabit/badge/?version=latest)](http://rabit.readthedocs.org/)
rabit is a light weight library that provides a fault tolerant interface of Allreduce and Broadcast. It is designed to support easy implementations of distributed machine learning programs, many of which fall naturally under the Allreduce abstraction. The goal of rabit is to support ***portable*** , ***scalable*** and ***reliable*** distributed machine learning programs.

2
doc/.gitignore vendored
View File

@ -1,3 +1,5 @@
html
latex
*.sh
_*
doxygen

View File

@ -8,7 +8,7 @@ PROJECT_NAME = "rabit"
PROJECT_NUMBER =
PROJECT_BRIEF =
PROJECT_LOGO =
OUTPUT_DIRECTORY = ../doc
OUTPUT_DIRECTORY = ../doc/doxygen
CREATE_SUBDIRS = NO
OUTPUT_LANGUAGE = English
BRIEF_MEMBER_DESC = YES
@ -216,7 +216,7 @@ MAN_LINKS = NO
#---------------------------------------------------------------------------
# configuration options related to the XML output
#---------------------------------------------------------------------------
GENERATE_XML = NO
GENERATE_XML = YES
XML_OUTPUT = xml
XML_SCHEMA =
XML_DTD =

192
doc/Makefile Normal file
View File

@ -0,0 +1,192 @@
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build
# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " applehelp to make an Apple Help Book"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
@echo " coverage to run coverage check of the documentation (if enabled)"
clean:
rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/rabit.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/rabit.qhc"
applehelp:
$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
@echo
@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
@echo "N.B. You won't be able to view it unless you put it in" \
"~/Library/Documentation/Help or install it in your application" \
"bundle."
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/rabit"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/rabit"
@echo "# devhelp"
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
latexpdfja:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through platex and dvipdfmx..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
info:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo "Running Texinfo files through makeinfo..."
make -C $(BUILDDIR)/texinfo info
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
coverage:
$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
@echo "Testing of coverage in the sources finished, look at the " \
"results in $(BUILDDIR)/coverage/python.txt."
xml:
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
@echo
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."

184
doc/conf.py Normal file
View File

@ -0,0 +1,184 @@
# -*- coding: utf-8 -*-
#
# documentation build configuration file, created by
# sphinx-quickstart on Thu Jul 23 19:40:08 2015.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
import os, subprocess
import shlex
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
libpath = os.path.join(curr_path, '../wrapper/')
sys.path.insert(0, os.path.join(curr_path, '../wrapper/'))
sys.path.insert(0, curr_path)
from sphinx_util import MarkdownParser, AutoStructify
# -- General configuration ------------------------------------------------
# General information about the project.
project = u'rabit'
copyright = u'2015, rabit developers'
author = u'rabit developers'
github_doc_root = 'https://github.com/dmlc/rabit/tree/master/doc/'
# add markdown parser
MarkdownParser.github_doc_root = github_doc_root
source_parsers = {
'.md': MarkdownParser,
}
# Version information.
import rabit
version = rabit.__version__
release = rabit.__version__
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.napoleon',
'sphinx.ext.mathjax',
'breathe',
]
# Use breathe to include doxygen documents
breathe_projects = {'rabit' : 'doxygen/xml/'}
breathe_default_project = 'rabit'
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
# source_suffix = ['.rst', '.md']
source_suffix = ['.rst', '.md']
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
# html_theme = 'alabaster'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'rabit.tex', project,
author, 'manual'),
]
# hook for doxygen
def run_doxygen(folder):
"""Run the doxygen make command in the designated folder."""
try:
retcode = subprocess.call("cd %s; make doxygen" % folder, shell=True)
if retcode < 0:
sys.stderr.write("doxygen terminated by signal %s" % (-retcode))
except OSError as e:
sys.stderr.write("doxygen execution failed: %s" % e)
def run_build_lib(folder):
"""Run the doxygen make command in the designated folder."""
try:
retcode = subprocess.call("cd %s; make" % folder, shell=True)
retcode = subprocess.call("rm -rf _build/html/doxygen", shell=True)
retcode = subprocess.call("mkdir _build", shell=True)
retcode = subprocess.call("mkdir _build/html", shell=True)
retcode = subprocess.call("cp -rf doxygen/html _build/html/doxygen", shell=True)
if retcode < 0:
sys.stderr.write("build terminated by signal %s" % (-retcode))
except OSError as e:
sys.stderr.write("build execution failed: %s" % e)
def generate_doxygen_xml(app):
"""Run the doxygen make commands if we're on the ReadTheDocs server"""
read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
if read_the_docs_build:
run_doxygen('..')
sys.stderr.write('Check if shared lib exists\n')
run_build_lib('..')
sys.stderr.write('The wrapper path: %s\n' % str(os.listdir('../wrapper')))
rabit._loadlib()
def setup(app):
# Add hook for building doxygen xml when needed
app.connect("builder-inited", generate_doxygen_xml)
app.add_config_value('recommonmark_config', {
'url_resolver': lambda url: github_doc_root + url,
}, True)
app.add_transform(AutoStructify)

9
doc/cpp_api.md Normal file
View File

@ -0,0 +1,9 @@
C++ Library API of Rabit
========================
This page contains document of Library API of rabit.
```eval_rst
.. toctree::
.. doxygennamespace:: rabit
```

View File

@ -1,10 +1,9 @@
Tutorial
=====
========
This is rabit's tutorial, a ***Reliable Allreduce and Broadcast Interface***.
All the example codes are in the [guide](https://github.com/dmlc/rabit/blob/master/guide/) folder of the project.
To run the examples locally, you will need to build them with ```make```.
Please also refer to the [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc) for further details.
**List of Topics**
* [What is Allreduce](#what-is-allreduce)
* [Common Use Case](#common-use-case)
@ -20,9 +19,9 @@ Please also refer to the [API Documentation](http://homes.cs.washington.edu/~tqc
* [Fault Tolerance](#fault-tolerance)
What is Allreduce
=====
-----------------
The main methods provided by rabit are Allreduce and Broadcast. Allreduce performs reduction across different computation nodes,
and returns the result to every node. To understand the behavior of the function, consider the following example in [basic.cc](basic.cc) (there is a python example right after this if you are more familiar with python).
and returns the result to every node. To understand the behavior of the function, consider the following example in [basic.cc](../guide/basic.cc) (there is a python example right after this if you are more familiar with python).
```c++
#include <rabit.h>
using namespace rabit;
@ -63,7 +62,7 @@ Rabit provides different reduction operators, for example, if you change ```op:
the reduction operation will be a summation, and the result will become ```a = {1, 3, 5}```.
You can also run the example with different processes by setting -n to different values.
If you are more familiar with python, you can also use rabit in python. The same example as before can be found in [basic.py](basic.py):
If you are more familiar with python, you can also use rabit in python. The same example as before can be found in [basic.py](../guide/basic.py):
```python
import numpy as np
@ -89,7 +88,7 @@ You can run the program using the following command
```
Broadcast is another method provided by rabit besides Allreduce. This function allows one node to broadcast its
local data to all other nodes. The following code in [broadcast.cc](broadcast.cc) broadcasts a string from
local data to all other nodes. The following code in [broadcast.cc](../guide/broadcast.cc) broadcasts a string from
node 0 to all other nodes.
```c++
#include <rabit.h>
@ -115,7 +114,7 @@ The following command starts the program with three worker processes.
```
Besides strings, rabit also allows to broadcast constant size array and vectors.
The counterpart in python can be found in [broadcast.py](broadcast.py). Here is a snippet so that you can get a better sense of how simple is to use the python library:
The counterpart in python can be found in [broadcast.py](../guide/broadcast.py). Here is a snippet so that you can get a better sense of how simple is to use the python library:
```python
import rabit
@ -132,7 +131,7 @@ rabit.finalize()
```
Common Use Case
=====
---------------
Many distributed machine learning algorithms involve splitting the data into different nodes,
computing statistics locally, and finally aggregating them. Such workflow is usually done repetitively through many iterations before the algorithm converges. Allreduce naturally meets the structure of such programs,
common use cases include:
@ -144,7 +143,7 @@ common use cases include:
Rabit is a reliable and portable library for distributed machine learning programs, that allow programs to run reliably on different platforms.
Use Rabit API
====
-------------
This section introduces topics about how to use rabit API.
You can always refer to [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc) for definition of each functions.
This section trys to gives examples of different aspectes of rabit API.
@ -202,8 +201,8 @@ into the data buffer, pass the data to Allreduce function, and get the reduced r
from failure, we can directly recover the result from other nodes(see also [Fault Tolerance](#fault-tolerance)) and
the data preparation procedure no longer necessary. Rabit Allreduce add an optional parameter preparation function
to support such scenario. User can pass in a function that corresponds to the data preparation procedure to Allreduce
calls, and the data preparation function will only be called when necessary. We use [lazy_allreduce.cc](lazy_allreduce.cc)
as an example to demonstrate this feature. It is modified from [basic.cc](basic.cc), and you can compare the two codes.
calls, and the data preparation function will only be called when necessary. We use [lazy_allreduce.cc](../guide/lazy_allreduce.cc)
as an example to demonstrate this feature. It is modified from [basic.cc](../guide/basic.cc), and you can compare the two codes.
```c++
#include <rabit.h>
using namespace rabit;
@ -242,7 +241,7 @@ the effect when a process goes down. You can run the program using the following
The additional arguments ```mock=0,0,1,0``` will cause node 0 to kill itself before second call of Allreduce (see also [mock test](#link-against-mock-test-rabit-library)).
You will find that the prepare function's print is only executed once and node 0 will no longer execute the preparation function when it restarts from failure.
You can also find python version of the example in [lazy_allreduce.py](lazy_allreduce.py), and run it using the followin command
You can also find python version of the example in [lazy_allreduce.py](../guide/lazy_allreduce.py), and run it using the followin command
```bash
../tracker/rabit_demo.py -n 2 lazy_allreduce.py mock=0,0,1,0
@ -250,7 +249,7 @@ You can also find python version of the example in [lazy_allreduce.py](lazy_allr
Since lazy preparation function may not be called during execution. User should be careful when using this feature. For example, a possible mistake
could be putting some memory allocation code in the lazy preparation function, and the computing memory was not allocated when lazy preparation function is not called.
The example in [lazy_allreduce.cc](lazy_allreduce.cc) provides a simple way to migrate normal prepration code([basic.cc](basic.cc)) to lazy version: wrap the preparation
The example in [lazy_allreduce.cc](../guide/lazy_allreduce.cc) provides a simple way to migrate normal prepration code([basic.cc](../guide/basic.cc)) to lazy version: wrap the preparation
code with a lambda function, and pass it to allreduce.
#### Checkpoint and LazyCheckpoint
@ -287,7 +286,7 @@ improve the efficiency of the program.
Compile Programs with Rabit
====
---------------------------
Rabit is a portable library, to use it, you only need to include the rabit header file.
* You will need to add the path to [../include](../include) to the header search path of the compiler
- Solution 1: add ```-I/path/to/rabit/include``` to the compiler flag in gcc or clang
@ -333,27 +332,27 @@ For example, consider the following script in the test case
- Note that ndeath = 1 means this will happen only if node 1 died once, which is our case
Running Rabit Jobs
====
------------------
Rabit is a portable library that can run on multiple platforms.
#### Running Rabit Locally
* You can use [../tracker/rabit_demo.py](../tracker/rabit_demo.py) to start n processes locally
* You can use [../tracker/rabit_demo.py](https://github.com/dmlc/rabit/blob/master/tracker/rabit_demo.py) to start n processes locally
* This script will restart the program when it exits with -2, so it can be used for [mock test](#link-against-mock-test-library)
#### Running Rabit on Hadoop
* You can use [../tracker/rabit_yarn.py](../tracker/rabit_yarn.py) to run rabit programs as Yarn application
* You can use [../tracker/rabit_yarn.py](https://github.com/dmlc/rabit/blob/master/tracker/rabit_yarn.py) to run rabit programs as Yarn application
* This will start rabit programs as yarn applications
- This allows multi-threading programs in each node, which can be more efficient
- An easy multi-threading solution could be to use OpenMP with rabit code
* It is also possible to run rabit program via hadoop streaming, however, YARN is highly recommended.
#### Running Rabit using MPI
* You can submit rabit programs to an MPI cluster using [../tracker/rabit_mpi.py](../tracker/rabit_mpi.py).
* You can submit rabit programs to an MPI cluster using [../tracker/rabit_mpi.py](https://github.com/dmlc/rabit/blob/master/tracker/rabit_mpi.py).
* If you linked your code against librabit_mpi.a, then you can directly use mpirun to submit the job
#### Customize Tracker Script
You can also modify the tracker script to allow rabit to run on other platforms. To do so, refer to existing
tracker scripts, such as [../tracker/rabit_hadoop.py](../tracker/rabit_hadoop.py) and [../tracker/rabit_mpi.py](../tracker/rabit_mpi.py) to get a sense of how it is done.
tracker scripts, such as [../tracker/rabit_yarn.py](../tracker/rabit_yarn.py) and [../tracker/rabit_mpi.py](https://github.com/dmlc/rabit/blob/master/tracker/rabit_mpi.py) to get a sense of how it is done.
You will need to implement a platform dependent submission function with the following definition
```python
@ -376,7 +375,7 @@ Note that the current rabit tracker does not restart a worker when it dies, the
- rabit-yarn provides such functionality in YARN
Fault Tolerance
=====
---------------
This section introduces how fault tolerance works in rabit.
The following figure shows how rabit deals with failures.

24
doc/index.md Normal file
View File

@ -0,0 +1,24 @@
Rabit Documentation
=====================
rabit is a light weight library that provides a fault tolerant interface of Allreduce and Broadcast. It is designed to support easy implementations of distributed machine learning programs, many of which fall naturally under the Allreduce abstraction. The goal of rabit is to support **portable** , **scalable** and **reliable** distributed machine learning programs.
API Documents
-------------
```eval_rst
.. toctree::
:maxdepth: 2
python_api.md
cpp_api.md
parameters.md
guide.md
```
Indices and tables
------------------
```eval_rst
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
```

View File

@ -1,4 +0,0 @@
#!/bin/bash
cd ../include
doxygen ../doc/Doxyfile
cd ../doc

View File

@ -1,14 +1,7 @@
Rabit Documentation
====
* [Tutorial](../guide)
* [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc)
- You can also run ```./mkdoc.sh``` to make the document locally
* [Parameters](#parameters)
Parameters
====
==========
This section list all the parameters that can be passed to rabit::Init function as argv.
All the parameters are passed in as string in format of ```parameter-name=parameter-value```.
All the parameters are passed in as string in format of ``parameter-name=parameter-value``.
In most setting these parameters have default value or will be automatically detected,
and do not need to be manually configured.

View File

@ -0,0 +1,4 @@
numpy
breathe
commonmark

11
doc/python_api.md Normal file
View File

@ -0,0 +1,11 @@
Python API of Rabit
===================
This page contains document of python API of rabit.
```eval_rst
.. toctree::
.. automodule:: rabit
:members:
:show-inheritance:
```

16
doc/sphinx_util.py Normal file
View File

@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-
"""Helper utilty function for customization."""
import sys
import os
import docutils
import subprocess
if os.environ.get('READTHEDOCS', None) == 'True':
subprocess.call('cd ..; rm -rf recommonmark;' +
'git clone https://github.com/tqchen/recommonmark', shell=True)
sys.path.insert(0, os.path.abspath('../recommonmark/'))
from recommonmark import parser, transform
MarkdownParser = parser.CommonMarkParser
AutoStructify = transform.AutoStructify

1
guide/README Normal file
View File

@ -0,0 +1 @@
See tutorial at ../doc/guide.md

View File

@ -5,11 +5,17 @@
*
* \author Tianqi Chen
*/
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <vector>
#include <rabit.h>
using namespace rabit;
const int N = 3;
int main(int argc, char *argv[]) {
int a[N];
int N = 3;
if (argc > 1) {
N = atoi(argv[1]);
}
std::vector<int> a(N);
rabit::Init(argc, argv);
for (int i = 0; i < N; ++i) {
a[i] = rabit::GetRank() + i;

View File

@ -14,6 +14,7 @@
// include uint64_t only to make io standalone
#ifdef _MSC_VER
/*! \brief uint64 */
typedef unsigned __int64 uint64_t;
#else
#include <inttypes.h>
@ -24,7 +25,7 @@ namespace dmlc {
/*!
* \brief interface of stream I/O for serialization
*/
class Stream {
class Stream { // NOLINT(*)
public:
/*!
* \brief reads data from a stream
@ -192,9 +193,10 @@ class InputSplit {
* List of possible types: "text", "recordio"
* - "text":
* text file, each line is treated as a record
* input split will split on \n or \r
* input split will split on '\\n' or '\\r'
* - "recordio":
* binary recordio file, see recordio.h
* \return a new input split
* \sa InputSplit::Type
*/
static InputSplit* Create(const char *uri,
@ -224,7 +226,7 @@ class ostream : public std::basic_ostream<char> {
* \param buffer_size internal streambuf size
*/
explicit ostream(Stream *stream,
size_t buffer_size = 1 << 10)
size_t buffer_size = (1 << 10))
: std::basic_ostream<char>(NULL), buf_(buffer_size) {
this->set_stream(stream);
}
@ -287,7 +289,7 @@ class istream : public std::basic_istream<char> {
* \param buffer_size internal buffer size
*/
explicit istream(Stream *stream,
size_t buffer_size = 1 << 10)
size_t buffer_size = (1 << 10))
: std::basic_istream<char>(NULL), buf_(buffer_size) {
this->set_stream(stream);
}

View File

@ -8,12 +8,18 @@
* rabit.h and serializable.h is all what the user needs to use the rabit interface
* \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
*/
#ifndef RABIT_RABIT_H_
#define RABIT_RABIT_H_
#ifndef RABIT_RABIT_H_ // NOLINT(*)
#define RABIT_RABIT_H_ // NOLINT(*)
#include <string>
#include <vector>
// whether or not use c++11 support
#ifndef DMLC_USE_CXX11
#define DMLC_USE_CXX11 (defined(__GXX_EXPERIMENTAL_CXX0X__) ||\
__cplusplus >= 201103L || defined(_MSC_VER))
#endif
// optionally support of lambda functions in C++11, if available
#if __cplusplus >= 201103L
#if DMLC_USE_CXX11
#include <functional>
#endif // C++11
// contains definition of Serializable
@ -117,10 +123,13 @@ inline void Broadcast(std::string *sendrecv_data, int root);
* this function is NOT thread-safe
*
* Example Usage: the following code does an Allreduce and outputs the sum as the result
* \code{.cpp}
* vector<int> data(10);
* ...
* Allreduce<op::Sum>(&data[0], data.size());
* ...
* \endcode
*
* \param sendrecvbuf buffer for both sending and receiving data
* \param count number of elements to be reduced
* \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
@ -132,15 +141,17 @@ inline void Broadcast(std::string *sendrecv_data, int root);
*/
template<typename OP, typename DType>
inline void Allreduce(DType *sendrecvbuf, size_t count,
void (*prepare_fun)(void *arg) = NULL,
void (*prepare_fun)(void *) = NULL,
void *prepare_arg = NULL);
// C++11 support for lambda prepare function
#if __cplusplus >= 201103L
#if DMLC_USE_CXX11
/*!
* \brief performs in-place Allreduce, on sendrecvbuf
* with a prepare function specified by a lambda function
*
* Example Usage: the following code does an Allreduce and outputs the sum as the result
* Example Usage:
* \code{.cpp}
* // the following code does an Allreduce and outputs the sum as the result
* vector<int> data(10);
* ...
* Allreduce<op::Sum>(&data[0], data.size(), [&]() {
@ -149,6 +160,7 @@ inline void Allreduce(DType *sendrecvbuf, size_t count,
* }
* });
* ...
* \endcode
* \param sendrecvbuf buffer for both sending and receiving data
* \param count number of elements to be reduced
* \param prepare_fun Lazy lambda preprocessing function, prepare_fun() will be invoked
@ -173,14 +185,15 @@ inline void Allreduce(DType *sendrecvbuf, size_t count,
* if returned version == 0, this means no model has been CheckPointed
* the p_model is not touched, users should do the necessary initialization by themselves
*
* Common usage example:
* \code{.cpp}
* // Example usage code of LoadCheckPoint
* int iter = rabit::LoadCheckPoint(&model);
* if (iter == 0) model.InitParameters();
* for (i = iter; i < max_iter; ++i) {
* do many things, include allreduce
* // do many things, include allreduce
* rabit::CheckPoint(model);
* }
*
* \endcode
* \sa CheckPoint, VersionNumber
*/
inline int LoadCheckPoint(Serializable *global_model,
@ -242,7 +255,7 @@ class ReduceHandle;
* \tparam freduce the customized reduction function
* DType must be a struct, with no pointer
*/
template<typename DType, void (*freduce)(DType &dst, const DType &src)>
template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)
class Reducer {
public:
Reducer(void);
@ -256,9 +269,9 @@ class Reducer {
* \param prepare_arg argument used to pass into the lazy preprocessing function
*/
inline void Allreduce(DType *sendrecvbuf, size_t count,
void (*prepare_fun)(void *arg) = NULL,
void (*prepare_fun)(void *) = NULL,
void *prepare_arg = NULL);
#if __cplusplus >= 201103L
#if DMLC_USE_CXX11
/*!
* \brief customized in-place all reduce operation, with lambda function as preprocessor
* \param sendrecvbuf pointer to the array of objects to be reduced
@ -300,10 +313,10 @@ class SerializeReducer {
*/
inline void Allreduce(DType *sendrecvobj,
size_t max_nbyte, size_t count,
void (*prepare_fun)(void *arg) = NULL,
void (*prepare_fun)(void *) = NULL,
void *prepare_arg = NULL);
// C++11 support for lambda prepare function
#if __cplusplus >= 201103L
#if DMLC_USE_CXX11
/*!
* \brief customized in-place all reduce operation, with lambda function as preprocessor
* \param sendrecvobj pointer to the array of objects to be reduced
@ -326,4 +339,4 @@ class SerializeReducer {
} // namespace rabit
// implementation of template functions
#include "./rabit/rabit-inl.h"
#endif // RABIT_RABIT_H_
#endif // RABIT_RABIT_H_ // NOLINT(*)

View File

@ -183,7 +183,9 @@ enum DataType {
kLong = 4,
kULong = 5,
kFloat = 6,
kDouble = 7
kDouble = 7,
kLongLong = 8,
kULongLong = 9
};
} // namespace mpi
/*!

View File

@ -4,8 +4,8 @@
* \brief utilities with different serializable implementations
* \author Tianqi Chen
*/
#ifndef RABIT_UTILS_IO_H_
#define RABIT_UTILS_IO_H_
#ifndef RABIT_IO_H_
#define RABIT_IO_H_
#include <cstdio>
#include <vector>
#include <cstring>
@ -51,6 +51,7 @@ struct MemoryFixSizeBuffer : public SeekStream {
virtual bool AtEnd(void) const {
return curr_ptr_ == buffer_size_;
}
private:
/*! \brief in memory buffer */
char *p_buffer_;
@ -93,6 +94,7 @@ struct MemoryBufferStream : public SeekStream {
virtual bool AtEnd(void) const {
return curr_ptr_ == p_buffer_->length();
}
private:
/*! \brief in memory buffer */
std::string *p_buffer_;
@ -101,4 +103,4 @@ struct MemoryBufferStream : public SeekStream {
}; // class MemoryBufferStream
} // namespace utils
} // namespace rabit
#endif // RABIT_UTILS_IO_H_
#endif // RABIT_IO_H_

View File

@ -1,12 +1,15 @@
/*!
* Copyright by Contributors
* \file rabit-inl.h
* \brief implementation of inline template function for rabit interface
*
* \author Tianqi Chen
*/
#ifndef RABIT_RABIT_INL_H
#define RABIT_RABIT_INL_H
#ifndef RABIT_RABIT_INL_H_
#define RABIT_RABIT_INL_H_
// use engine for implementation
#include <vector>
#include <string>
#include "./io.h"
#include "./utils.h"
#include "../rabit.h"
@ -30,15 +33,15 @@ inline DataType GetType<int>(void) {
return kInt;
}
template<>
inline DataType GetType<unsigned>(void) {
inline DataType GetType<unsigned int>(void) { // NOLINT(*)
return kUInt;
}
template<>
inline DataType GetType<long>(void) {
inline DataType GetType<long>(void) { // NOLINT(*)
return kLong;
}
template<>
inline DataType GetType<unsigned long>(void) {
inline DataType GetType<unsigned long>(void) { // NOLINT(*)
return kULong;
}
template<>
@ -49,42 +52,50 @@ template<>
inline DataType GetType<double>(void) {
return kDouble;
}
template<>
inline DataType GetType<long long>(void) { // NOLINT(*)
return kLongLong;
}
template<>
inline DataType GetType<unsigned long long>(void) { // NOLINT(*)
return kULongLong;
}
} // namespace mpi
} // namespace engine
namespace op {
struct Max {
const static engine::mpi::OpType kType = engine::mpi::kMax;
static const engine::mpi::OpType kType = engine::mpi::kMax;
template<typename DType>
inline static void Reduce(DType &dst, const DType &src) {
inline static void Reduce(DType &dst, const DType &src) { // NOLINT(*)
if (dst < src) dst = src;
}
};
struct Min {
const static engine::mpi::OpType kType = engine::mpi::kMin;
static const engine::mpi::OpType kType = engine::mpi::kMin;
template<typename DType>
inline static void Reduce(DType &dst, const DType &src) {
inline static void Reduce(DType &dst, const DType &src) { // NOLINT(*)
if (dst > src) dst = src;
}
};
struct Sum {
const static engine::mpi::OpType kType = engine::mpi::kSum;
static const engine::mpi::OpType kType = engine::mpi::kSum;
template<typename DType>
inline static void Reduce(DType &dst, const DType &src) {
inline static void Reduce(DType &dst, const DType &src) { // NOLINT(*)
dst += src;
}
};
struct BitOR {
const static engine::mpi::OpType kType = engine::mpi::kBitwiseOR;
static const engine::mpi::OpType kType = engine::mpi::kBitwiseOR;
template<typename DType>
inline static void Reduce(DType &dst, const DType &src) {
inline static void Reduce(DType &dst, const DType &src) { // NOLINT(*)
dst |= src;
}
};
template<typename OP, typename DType>
inline void Reducer(const void *src_, void *dst_, int len, const MPI::Datatype &dtype) {
const DType *src = (const DType*)src_;
DType *dst = (DType*)dst_;
DType *dst = (DType*)dst_; // NOLINT(*)
for (int i = 0; i < len; ++i) {
OP::Reduce(dst[i], src[i]);
}
@ -151,7 +162,7 @@ inline void Allreduce(DType *sendrecvbuf, size_t count,
}
// C++11 support for lambda prepare function
#if __cplusplus >= 201103L
#if DMLC_USE_CXX11
inline void InvokeLambda_(void *fun) {
(*static_cast<std::function<void()>*>(fun))();
}
@ -215,15 +226,16 @@ inline void ReducerSafe_(const void *src_, void *dst_, int len_, const MPI::Data
}
}
// function to perform reduction for Reducer
template<typename DType, void (*freduce)(DType &dst, const DType &src)>
inline void ReducerAlign_(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) {
template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)
inline void ReducerAlign_(const void *src_, void *dst_,
int len_, const MPI::Datatype &dtype) {
const DType *psrc = reinterpret_cast<const DType*>(src_);
DType *pdst = reinterpret_cast<DType*>(dst_);
for (int i = 0; i < len_; ++i) {
freduce(pdst[i], psrc[i]);
}
}
template<typename DType, void (*freduce)(DType &dst, const DType &src)>
template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)
inline Reducer<DType, freduce>::Reducer(void) {
// it is safe to directly use handle for aligned data types
if (sizeof(DType) == 8 || sizeof(DType) == 4 || sizeof(DType) == 1) {
@ -232,7 +244,7 @@ inline Reducer<DType, freduce>::Reducer(void) {
this->handle_.Init(ReducerSafe_<DType, freduce>, sizeof(DType));
}
}
template<typename DType, void (*freduce)(DType &dst, const DType &src)>
template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)
inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
void (*prepare_fun)(void *arg),
void *prepare_arg) {
@ -240,13 +252,14 @@ inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
}
// function to perform reduction for SerializeReducer
template<typename DType>
inline void SerializeReducerFunc_(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) {
inline void SerializeReducerFunc_(const void *src_, void *dst_,
int len_, const MPI::Datatype &dtype) {
int nbytes = engine::ReduceHandle::TypeSize(dtype);
// temp space
DType tsrc, tdst;
for (int i = 0; i < len_; ++i) {
utils::MemoryFixSizeBuffer fsrc((char*)(src_) + i * nbytes, nbytes);
utils::MemoryFixSizeBuffer fdst((char*)(dst_) + i * nbytes, nbytes);
utils::MemoryFixSizeBuffer fsrc((char*)(src_) + i * nbytes, nbytes); // NOLINT(*)
utils::MemoryFixSizeBuffer fdst((char*)(dst_) + i * nbytes, nbytes); // NOLINT(*)
tsrc.Load(fsrc);
tdst.Load(fdst);
// govern const check
@ -298,8 +311,8 @@ inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
}
}
#if __cplusplus >= 201103L
template<typename DType, void (*freduce)(DType &dst, const DType &src)>
#if DMLC_USE_CXX11
template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)g
inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
std::function<void()> prepare_fun) {
this->Allreduce(sendrecvbuf, count, InvokeLambda_, &prepare_fun);
@ -312,4 +325,4 @@ inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
}
#endif
} // namespace rabit
#endif
#endif // RABIT_RABIT_INL_H_

View File

@ -1,4 +1,5 @@
/*!
* Copyright by Contributors
* \file timer.h
* \brief This file defines the utils for timing
* \author Tianqi Chen, Nacho, Tianyi
@ -18,7 +19,6 @@ namespace utils {
* \brief return time in seconds, not cross platform, avoid to use this in most places
*/
inline double GetTime(void) {
// TODO: use c++11 chrono when c++11 was available
#ifdef __MACH__
clock_serv_t cclock;
mach_timespec_t mts;
@ -32,7 +32,6 @@ inline double GetTime(void) {
utils::Check(clock_gettime(CLOCK_REALTIME, &ts) == 0, "failed to get time");
return static_cast<double>(ts.tv_sec) + static_cast<double>(ts.tv_nsec) * 1e-9;
#else
// TODO: add MSVC macro, and MSVC timer
return static_cast<double>(time(NULL));
#endif
#endif

View File

@ -163,7 +163,7 @@ inline std::FILE *FopenCheck(const char *fname, const char *flag) {
// easy utils that can be directly accessed in xgboost
/*! \brief get the beginning address of a vector */
template<typename T>
inline T *BeginPtr(std::vector<T> &vec) {
inline T *BeginPtr(std::vector<T> &vec) { // NOLINT(*)
if (vec.size() == 0) {
return NULL;
} else {
@ -172,14 +172,14 @@ inline T *BeginPtr(std::vector<T> &vec) {
}
/*! \brief get the beginning address of a vector */
template<typename T>
inline const T *BeginPtr(const std::vector<T> &vec) {
inline const T *BeginPtr(const std::vector<T> &vec) { // NOLINT(*)
if (vec.size() == 0) {
return NULL;
} else {
return &vec[0];
}
}
inline char* BeginPtr(std::string &str) {
inline char* BeginPtr(std::string &str) { // NOLINT(*)
if (str.length() == 0) return NULL;
return &str[0];
}

View File

@ -4,8 +4,8 @@
* \brief defines serializable interface of rabit
* \author Tianqi Chen
*/
#ifndef RABIT_RABIT_SERIALIZABLE_H_
#define RABIT_RABIT_SERIALIZABLE_H_
#ifndef RABIT_SERIALIZABLE_H_
#define RABIT_SERIALIZABLE_H_
#include <vector>
#include <string>
#include "./rabit/utils.h"
@ -24,4 +24,4 @@ typedef dmlc::Stream Stream;
typedef dmlc::Serializable Serializable;
} // namespace rabit
#endif // RABIT_RABIT_SERIALIZABLE_H_
#endif // RABIT_SERIALIZABLE_H_

8
scripts/travis_runtest.sh Executable file
View File

@ -0,0 +1,8 @@
#!/bin/bash
make -f test.mk model_recover_10_10k || exit -1
make -f test.mk model_recover_10_10k_die_same || exit -1
make -f test.mk local_recover_10_10k || exit -1
make -f test.mk pylocal_recover_10_10k || exit -1
make -f test.mk lazy_recover_10_10k_die_hard || exit -1
make -f test.mk lazy_recover_10_10k_die_same || exit -1
make -f test.mk ringallreduce_10_10k || exit -1

22
scripts/travis_script.sh Executable file
View File

@ -0,0 +1,22 @@
#!/bin/bash
# main script of travis
if [ ${TASK} == "lint" ]; then
make lint || exit -1
fi
if [ ${TASK} == "doc" ]; then
make doc 2>log.txt
(cat log.txt| grep -v ENABLE_PREPROCESSING |grep -v "unsupported tag" |grep warning) && exit -1
fi
if [ ${TASK} == "build" ]; then
make all || exit -1
fi
if [ ${TASK} == "test" ]; then
cd test
make all || exit -1
../scripts/travis_runtest.sh || exit -1
fi

View File

@ -24,6 +24,7 @@ AllreduceBase::AllreduceBase(void) {
nport_trial = 1000;
rank = 0;
world_size = -1;
connect_retry = 5;
hadoop_mode = 0;
version_number = 0;
// 32 K items
@ -46,6 +47,7 @@ AllreduceBase::AllreduceBase(void) {
env_vars.push_back("DMLC_NUM_ATTEMPT");
env_vars.push_back("DMLC_TRACKER_URI");
env_vars.push_back("DMLC_TRACKER_PORT");
env_vars.push_back("DMLC_WORKER_CONNECT_RETRY");
}
// initialization function
@ -94,7 +96,8 @@ void AllreduceBase::Init(void) {
}
}
if (dmlc_role != "worker") {
fprintf(stderr, "Rabit Module currently only work with dmlc worker, quit this program by exit 0\n");
fprintf(stderr, "Rabit Module currently only work with dmlc worker"\
", quit this program by exit 0\n");
exit(0);
}
// clear the setting before start reconnection
@ -134,7 +137,7 @@ void AllreduceBase::TrackerPrint(const std::string &msg) {
// util to parse data with unit suffix
inline size_t ParseUnit(const char *name, const char *val) {
char unit;
unsigned long amt;
unsigned long amt; // NOLINT(*)
int n = sscanf(val, "%lu%c", &amt, &unit);
size_t amount = amt;
if (n == 2) {
@ -174,6 +177,9 @@ void AllreduceBase::SetParam(const char *name, const char *val) {
if (!strcmp(name, "rabit_reduce_buffer")) {
reduce_buffer_size = (ParseUnit(name, val) + 7) >> 3;
}
if (!strcmp(name, "DMLC_WORKER_CONNECT_RETRY")) {
connect_retry = atoi(val);
}
}
/*!
* \brief initialize connection to the tracker
@ -184,9 +190,23 @@ utils::TCPSocket AllreduceBase::ConnectTracker(void) const {
// get information from tracker
utils::TCPSocket tracker;
tracker.Create();
int retry = 0;
do {
fprintf(stderr, "connect to ip: [%s]\n", tracker_uri.c_str());
if (!tracker.Connect(utils::SockAddr(tracker_uri.c_str(), tracker_port))) {
if (++retry >= connect_retry) {
fprintf(stderr, "connect to (failed): [%s]\n", tracker_uri.c_str());
utils::Socket::Error("Connect");
} else {
fprintf(stderr, "retry connect to ip(retry time %d): [%s]\n", retry, tracker_uri.c_str());
sleep(1);
continue;
}
}
break;
} while (1);
using utils::Assert;
Assert(tracker.SendAll(&magic, sizeof(magic)) == sizeof(magic),
"ReConnectLink failure 1");
@ -513,7 +533,7 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_,
if (len != -1) {
size_up_out += static_cast<size_t>(len);
} else {
ReturnType ret = Errno2Return(errno);
ReturnType ret = Errno2Return();
if (ret != kSuccess) {
return ReportError(&links[parent_index], ret);
}
@ -533,7 +553,7 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_,
utils::Assert(size_down_in <= size_up_out,
"Allreduce: boundary error");
} else {
ReturnType ret = Errno2Return(errno);
ReturnType ret = Errno2Return();
if (ret != kSuccess) {
return ReportError(&links[parent_index], ret);
}
@ -709,7 +729,7 @@ AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size,
if (len != -1) {
read_ptr += static_cast<size_t>(len);
} else {
ReturnType ret = Errno2Return(errno);
ReturnType ret = Errno2Return();
if (ret != kSuccess) return ReportError(&next, ret);
}
}
@ -723,7 +743,7 @@ AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size,
if (len != -1) {
write_ptr += static_cast<size_t>(len);
} else {
ReturnType ret = Errno2Return(errno);
ReturnType ret = Errno2Return();
if (ret != kSuccess) return ReportError(&prev, ret);
}
}
@ -826,7 +846,7 @@ AllreduceBase::TryReduceScatterRing(void *sendrecvbuf_,
if (len != -1) {
write_ptr += static_cast<size_t>(len);
} else {
ReturnType ret = Errno2Return(errno);
ReturnType ret = Errno2Return();
if (ret != kSuccess) return ReportError(&prev, ret);
}
}

View File

@ -90,6 +90,7 @@ class AllreduceBase : public IEngine {
PreprocFunction prepare_fun = NULL,
void *prepare_arg = NULL) {
if (prepare_fun != NULL) prepare_fun(prepare_arg);
if (world_size == 1) return;
utils::Assert(TryAllreduce(sendrecvbuf_,
type_nbytes, count, reducer) == kSuccess,
"Allreduce failed");
@ -101,6 +102,7 @@ class AllreduceBase : public IEngine {
* \param root the root worker id to broadcast the data
*/
virtual void Broadcast(void *sendrecvbuf_, size_t total_size, int root) {
if (world_size == 1) return;
utils::Assert(TryBroadcast(sendrecvbuf_, total_size, root) == kSuccess,
"Broadcast failed");
}
@ -223,7 +225,7 @@ class AllreduceBase : public IEngine {
ReturnTypeEnum value;
// constructor
ReturnType() {}
ReturnType(ReturnTypeEnum value) : value(value){}
ReturnType(ReturnTypeEnum value) : value(value) {} // NOLINT(*)
inline bool operator==(const ReturnTypeEnum &v) const {
return value == v;
}
@ -232,8 +234,13 @@ class AllreduceBase : public IEngine {
}
};
/*! \brief translate errno to return type */
inline static ReturnType Errno2Return(int errsv) {
if (errsv == EAGAIN || errsv == EWOULDBLOCK) return kSuccess;
inline static ReturnType Errno2Return() {
int errsv = utils::Socket::GetLastError();
if (errsv == EAGAIN || errsv == EWOULDBLOCK || errsv == 0) return kSuccess;
#ifdef _WIN32
if (errsv == WSAEWOULDBLOCK) return kSuccess;
if (errsv == WSAECONNRESET) return kConnReset;
#endif
if (errsv == ECONNRESET) return kConnReset;
return kSockError;
}
@ -297,7 +304,7 @@ class AllreduceBase : public IEngine {
if (len == 0) {
sock.Close(); return kRecvZeroLen;
}
if (len == -1) return Errno2Return(errno);
if (len == -1) return Errno2Return();
size_read += static_cast<size_t>(len);
return kSuccess;
}
@ -316,7 +323,7 @@ class AllreduceBase : public IEngine {
if (len == 0) {
sock.Close(); return kRecvZeroLen;
}
if (len == -1) return Errno2Return(errno);
if (len == -1) return Errno2Return();
size_read += static_cast<size_t>(len);
return kSuccess;
}
@ -329,7 +336,7 @@ class AllreduceBase : public IEngine {
inline ReturnType WriteFromArray(const void *sendbuf_, size_t max_size) {
const char *p = static_cast<const char*>(sendbuf_);
ssize_t len = sock.Send(p + size_write, max_size - size_write);
if (len == -1) return Errno2Return(errno);
if (len == -1) return Errno2Return();
size_write += static_cast<size_t>(len);
return kSuccess;
}
@ -512,7 +519,9 @@ class AllreduceBase : public IEngine {
int rank;
// world size
int world_size;
// connect retry time
int connect_retry;
};
} // namespace engine
} // namespace rabit
#endif // RABIT_ALLREDUCE_BASE_H
#endif // RABIT_ALLREDUCE_BASE_H_

View File

@ -1,4 +1,5 @@
/*!
* Copyright by Contributors
* \file allreduce_mock.h
* \brief Mock test module of AllReduce engine,
* insert failures in certain call point, to test if the engine is robust to failure
@ -100,6 +101,7 @@ class AllreduceMock : public AllreduceRobust {
this->Verify(MockKey(rank, version_number, seq_counter, num_trial), "LazyCheckPoint");
AllreduceRobust::LazyCheckPoint(global_model);
}
protected:
// force checkpoint to local
int force_local;

View File

@ -5,8 +5,8 @@
*
* \author Tianqi Chen
*/
#ifndef RABIT_ENGINE_ROBUST_INL_H_
#define RABIT_ENGINE_ROBUST_INL_H_
#ifndef RABIT_ALLREDUCE_ROBUST_INL_H_
#define RABIT_ALLREDUCE_ROBUST_INL_H_
#include <vector>
namespace rabit {
@ -80,8 +80,16 @@ AllreduceRobust::MsgPassing(const NodeType &node_value,
selecter.WatchRead(links[i].sock);
}
break;
case 1: if (i == parent_index) selecter.WatchWrite(links[i].sock); break;
case 2: if (i == parent_index) selecter.WatchRead(links[i].sock); break;
case 1:
if (i == parent_index) {
selecter.WatchWrite(links[i].sock);
}
break;
case 2:
if (i == parent_index) {
selecter.WatchRead(links[i].sock);
}
break;
case 3:
if (i != parent_index && links[i].size_write != sizeof(EdgeType)) {
selecter.WatchWrite(links[i].sock);
@ -158,4 +166,4 @@ AllreduceRobust::MsgPassing(const NodeType &node_value,
}
} // namespace engine
} // namespace rabit
#endif // RABIT_ENGINE_ROBUST_INL_H_
#endif // RABIT_ALLREDUCE_ROBUST_INL_H_

View File

@ -224,7 +224,7 @@ void AllreduceRobust::LocalModelCheck(bool with_local) {
num_local_replica = 0;
}
} else {
utils::Check(use_local_model == int(with_local),
utils::Check(use_local_model == static_cast<int>(with_local),
"Can only call Checkpoint/LoadCheckPoint always with"\
"or without local_model, but not mixed case");
}
@ -691,7 +691,7 @@ AllreduceRobust::TryRecoverData(RecoverType role,
if (len != -1) {
links[i].size_write += len;
} else {
ReturnType ret = Errno2Return(errno);
ReturnType ret = Errno2Return();
if (ret != kSuccess) return ReportError(&links[i], ret);
}
}
@ -1161,7 +1161,7 @@ AllreduceRobust::RingPassing(void *sendrecvbuf_,
if (len != -1) {
read_ptr += static_cast<size_t>(len);
} else {
ReturnType ret = Errno2Return(errno);
ReturnType ret = Errno2Return();
if (ret != kSuccess) return ReportError(&prev, ret);
}
}
@ -1171,7 +1171,7 @@ AllreduceRobust::RingPassing(void *sendrecvbuf_,
if (len != -1) {
write_ptr += static_cast<size_t>(len);
} else {
ReturnType ret = Errno2Return(errno);
ReturnType ret = Errno2Return();
if (ret != kSuccess) return ReportError(&prev, ret);
}
}

View File

@ -287,6 +287,7 @@ class AllreduceRobust : public AllreduceBase {
if (seqno_.size() == 0) return -1;
return seqno_.back();
}
private:
// sequence number of each
std::vector<int> seqno_;

View File

@ -110,6 +110,8 @@ inline MPI::Datatype GetType(mpi::DataType dtype) {
case kULong: return MPI::UNSIGNED_LONG;
case kFloat: return MPI::FLOAT;
case kDouble: return MPI::DOUBLE;
case kLongLong: return MPI::LONG_LONG;
case kULongLong: return MPI::UNSIGNED_LONG_LONG;
}
utils::Error("unknown mpi::DataType");
return MPI::CHAR;
@ -164,7 +166,7 @@ void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {
if (type_nbytes != 0) {
MPI::Datatype *dtype = new MPI::Datatype();
if (type_nbytes % 8 == 0) {
*dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long));
*dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long)); // NOLINT(*)
} else if (type_nbytes % 4 == 0) {
*dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int));
} else {
@ -193,7 +195,7 @@ void ReduceHandle::Allreduce(void *sendrecvbuf,
dtype->Free();
}
if (type_nbytes % 8 == 0) {
*dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long));
*dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long)); // NOLINT(*)
} else if (type_nbytes % 4 == 0) {
*dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int));
} else {

View File

@ -94,6 +94,25 @@ class Socket {
inline operator SOCKET() const {
return sockfd;
}
/*!
* \return last error of socket operation
*/
inline static int GetLastError(void) {
#ifdef _WIN32
return WSAGetLastError();
#else
return errno;
#endif
}
/*! \return whether last error was would block */
inline static bool LastErrorWouldBlock(void) {
int errsv = GetLastError();
#ifdef _WIN32
return errsv == WSAEWOULDBLOCK;
#else
return errsv == EAGAIN || errsv == EWOULDBLOCK;
#endif
}
/*!
* \brief start up the socket module
* call this before using the sockets
@ -216,8 +235,12 @@ class Socket {
}
// report an socket error
inline static void Error(const char *msg) {
int errsv = errno;
int errsv = GetLastError();
#ifdef _WIN32
utils::Error("Socket %s Error:WSAError-code=%d", msg, errsv);
#else
utils::Error("Socket %s Error:%s", msg, strerror(errsv));
#endif
}
protected:
@ -241,7 +264,8 @@ class TCPSocket : public Socket{
*/
inline void SetKeepAlive(bool keepalive) {
int opt = static_cast<int>(keepalive);
if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE, reinterpret_cast<char*>(&opt), sizeof(opt)) < 0) {
if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE,
reinterpret_cast<char*>(&opt), sizeof(opt)) < 0) {
Socket::Error("SetKeepAlive");
}
}
@ -276,7 +300,7 @@ class TCPSocket : public Socket{
*/
inline int AtMark(void) const {
#ifdef _WIN32
unsigned long atmark;
unsigned long atmark; // NOLINT(*)
if (ioctlsocket(sockfd, SIOCATMARK, &atmark) != NO_ERROR) return -1;
#else
int atmark;
@ -330,7 +354,7 @@ class TCPSocket : public Socket{
while (ndone < len) {
ssize_t ret = send(sockfd, buf, static_cast<ssize_t>(len - ndone), 0);
if (ret == -1) {
if (errno == EAGAIN || errno == EWOULDBLOCK) return ndone;
if (LastErrorWouldBlock()) return ndone;
Socket::Error("SendAll");
}
buf += ret;
@ -352,7 +376,7 @@ class TCPSocket : public Socket{
ssize_t ret = recv(sockfd, buf,
static_cast<sock_size_t>(len - ndone), MSG_WAITALL);
if (ret == -1) {
if (errno == EAGAIN || errno == EWOULDBLOCK) return ndone;
if (LastErrorWouldBlock()) return ndone;
Socket::Error("RecvAll");
}
if (ret == 0) return ndone;
@ -450,7 +474,7 @@ struct SelectHelper {
* \param timeout the timeout counter, can be 0, which means wait until the event happen
* \return 1 if success, 0 if timeout, and -1 if error occurs
*/
inline static int WaitExcept(SOCKET fd, long timeout = 0) {
inline static int WaitExcept(SOCKET fd, long timeout = 0) { // NOLINT(*)
fd_set wait_set;
FD_ZERO(&wait_set);
FD_SET(fd, &wait_set);
@ -466,7 +490,7 @@ struct SelectHelper {
* \return number of active descriptors selected,
* return -1 if error occurs
*/
inline int Select(long timeout = 0) {
inline int Select(long timeout = 0) { // NOLINT(*)
int ret = Select_(static_cast<int>(maxfd + 1),
&read_set, &write_set, &except_set, timeout);
if (ret == -1) {
@ -477,7 +501,7 @@ struct SelectHelper {
private:
inline static int Select_(int maxfd, fd_set *rfds,
fd_set *wfds, fd_set *efds, long timeout) {
fd_set *wfds, fd_set *efds, long timeout) { // NOLINT(*)
#if !defined(_WIN32)
utils::Assert(maxfd < FD_SETSIZE, "maxdf must be smaller than FDSETSIZE");
#endif

View File

@ -2,7 +2,7 @@ export CC = gcc
export CXX = g++
export MPICXX = mpicxx
export LDFLAGS= -L../lib -pthread -lm -lrt
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++11
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++0x
# specify tensor path
BIN = speed_test model_recover local_recover lazy_recover

View File

@ -1,7 +1,7 @@
# this is a makefile used to show testcases of rabit
.PHONY: all
all:
all: model_recover_10_10k model_recover_10_10k_die_same
# this experiment test recovery with actually process exit, use keepalive to keep program alive
model_recover_10_10k:

View File

@ -132,7 +132,7 @@ class Tracker:
break
except socket.error:
continue
sock.listen(16)
sock.listen(128)
self.sock = sock
self.verbose = verbose
if hostIP == 'auto':

View File

@ -100,6 +100,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<AdditionalIncludeDirectories>..\..\include</AdditionalIncludeDirectories>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>

View File

@ -1,8 +1,9 @@
"""
Python interface for rabit
Reliable Allreduce and Broadcast Library
Reliable Allreduce and Broadcast Library.
Author: Tianqi Chen
"""
# pylint: disable=unused-argument,invalid-name,global-statement,dangerous-default-value,
import cPickle as pickle
import ctypes
import os
@ -10,34 +11,41 @@ import sys
import warnings
import numpy as np
# version information about the doc
__version__ = '1.0'
if os.name == 'nt':
WRAPPER_PATH = os.path.dirname(__file__) + '\\..\\windows\\x64\\Release\\rabit_wrapper%s.dll'
else:
WRAPPER_PATH = os.path.dirname(__file__) + '/librabit_wrapper%s.so'
rbtlib = None
_LIB = None
# load in xgboost library
def loadlib__(lib = 'standard'):
global rbtlib
if rbtlib != None:
warnings.Warn('rabit.int call was ignored because it has already been initialized', level = 2)
def _loadlib(lib='standard'):
"""Load rabit library."""
global _LIB
if _LIB != None:
warnings.warn('rabit.int call was ignored because it has'\
' already been initialized', level=2)
return
if lib == 'standard':
rbtlib = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '')
_LIB = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '')
elif lib == 'mock':
rbtlib = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '_mock')
_LIB = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '_mock')
elif lib == 'mpi':
rbtlib = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '_mpi')
_LIB = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '_mpi')
else:
raise Exception('unknown rabit lib %s, can be standard, mock, mpi' % lib)
rbtlib.RabitGetRank.restype = ctypes.c_int
rbtlib.RabitGetWorldSize.restype = ctypes.c_int
rbtlib.RabitVersionNumber.restype = ctypes.c_int
_LIB.RabitGetRank.restype = ctypes.c_int
_LIB.RabitGetWorldSize.restype = ctypes.c_int
_LIB.RabitVersionNumber.restype = ctypes.c_int
def unloadlib__():
global rbtlib
del rbtlib
rbtlib = None
def _unloadlib():
"""Unload rabit library."""
global _LIB
del _LIB
_LIB = None
# reduction operators
MAX = 0
@ -45,101 +53,98 @@ MIN = 1
SUM = 2
BITOR = 3
def check_err__():
"""
reserved function used to check error
"""
return
def init(args=None, lib='standard'):
"""Intialize the rabit module, call this once before using anything.
def init(args = sys.argv, lib = 'standard'):
Parameters
----------
args: list of str, optional
The list of arguments used to initialized the rabit
usually you need to pass in sys.argv.
Defaults to sys.argv when it is None.
lib: {'standard', 'mock', 'mpi'}
Type of library we want to load
"""
intialize the rabit module, call this once before using anything
Arguments:
args: list(string) [default=sys.argv]
the list of arguments used to initialized the rabit
usually you need to pass in sys.argv
with_mock: boolean [default=False]
Whether initialize the mock test module
"""
loadlib__(lib)
if args is None:
args = sys.argv
_loadlib(lib)
arr = (ctypes.c_char_p * len(args))()
arr[:] = args
rbtlib.RabitInit(len(args), arr)
check_err__()
_LIB.RabitInit(len(args), arr)
def finalize():
"""Finalize the rabit engine.
Call this function after you finished all jobs.
"""
finalize the rabit engine, call this function after you finished all jobs
"""
rbtlib.RabitFinalize()
check_err__()
unloadlib__()
_LIB.RabitFinalize()
_unloadlib()
def get_rank():
"""Get rank of current process.
Returns
-------
rank : int
Rank of current process.
"""
Returns rank of current process
"""
ret = rbtlib.RabitGetRank()
check_err__()
ret = _LIB.RabitGetRank()
return ret
def get_world_size():
"""Get total number workers.
Returns
-------
n : int
Total number of process.
"""
Returns get total number of process
"""
ret = rbtlib.RabitGetWorldSize()
check_err__()
ret = _LIB.RabitGetWorldSize()
return ret
def tracker_print(msg):
"""
print message to the tracker
this function can be used to communicate the information of the progress
to the tracker
"""Print message to the tracker.
This function can be used to communicate the information of
the progress to the tracker
Parameters
----------
msg : str
The message to be printed to tracker.
"""
if not isinstance(msg, str):
msg = str(msg)
rbtlib.RabitTrackerPrint(ctypes.c_char_p(msg).encode('utf-8'))
check_err__()
_LIB.RabitTrackerPrint(ctypes.c_char_p(msg).encode('utf-8'))
def get_processor_name():
"""
Returns the name of processor(host)
"""Get the processor name.
Returns
-------
name : str
the name of processor(host)
"""
mxlen = 256
length = ctypes.c_ulong()
buf = ctypes.create_string_buffer(mxlen)
rbtlib.RabitGetProcessorName(buf, ctypes.byref(length),
mxlen)
check_err__()
_LIB.RabitGetProcessorName(buf, ctypes.byref(length), mxlen)
return buf.value
def broadcast(data, root):
"""
broadcast object from one node to all other nodes
this function will return the broadcasted object
"""Broadcast object from one node to all other nodes.
Example: the following example broadcast hello from rank 0 to all other nodes
```python
rabit.init()
n = 3
rank = rabit.get_rank()
s = None
if rank == 0:
s = {'hello world':100, 2:3}
print '@node[%d] before-broadcast: s=\"%s\"' % (rank, str(s))
s = rabit.broadcast(s, 0)
print '@node[%d] after-broadcast: s=\"%s\"' % (rank, str(s))
rabit.finalize()
```
Arguments:
Parameters
----------
data : any type that can be pickled
input data, if current rank does not equal root, this can be None
Input data, if current rank does not equal root, this can be None
root : int
rank of the node to broadcast data from
Returns:
the result of broadcast
Rank of the node to broadcast data from.
Returns
-------
object : int
the result of broadcast.
"""
rank = get_rank()
length = ctypes.c_ulong()
@ -148,22 +153,18 @@ def broadcast(data, root):
s = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
length.value = len(s)
# run first broadcast
rbtlib.RabitBroadcast(ctypes.byref(length),
ctypes.sizeof(ctypes.c_ulong),
root)
check_err__()
_LIB.RabitBroadcast(ctypes.byref(length),
ctypes.sizeof(ctypes.c_ulong), root)
if root != rank:
dptr = (ctypes.c_char * length.value)()
# run second
rbtlib.RabitBroadcast(ctypes.cast(dptr, ctypes.c_void_p),
_LIB.RabitBroadcast(ctypes.cast(dptr, ctypes.c_void_p),
length.value, root)
check_err__()
data = pickle.loads(dptr.raw)
del dptr
else:
rbtlib.RabitBroadcast(ctypes.cast(ctypes.c_char_p(s), ctypes.c_void_p),
_LIB.RabitBroadcast(ctypes.cast(ctypes.c_char_p(s), ctypes.c_void_p),
length.value, root)
check_err__()
del s
return data
@ -180,19 +181,28 @@ DTYPE_ENUM__ = {
}
def allreduce(data, op, prepare_fun=None):
"""
perform allreduce, return the result, this function is not thread-safe
Arguments:
data: numpy ndarray
input data
"""Perform allreduce, return the result.
Parameters
----------
data: numpy array
Input data.
op: int
reduction operators, can be MIN, MAX, SUM, BITOR
prepare_fun: lambda data
Reduction operators, can be MIN, MAX, SUM, BITOR
prepare_fun: function
Lazy preprocessing function, if it is not None, prepare_fun(data)
will be called by the function before performing allreduce, to intialize the data
If the result of Allreduce can be recovered directly, then prepare_fun will NOT be called
Returns:
the result of allreduce, have same shape as data
If the result of Allreduce can be recovered directly,
then prepare_fun will NOT be called
Returns
-------
result : array_like
The result of allreduce, have same shape as data
Notes
-----
This function is not thread-safe.
"""
if not isinstance(data, np.ndarray):
raise Exception('allreduce only takes in numpy.ndarray')
@ -202,21 +212,21 @@ def allreduce(data, op, prepare_fun = None):
if buf.dtype not in DTYPE_ENUM__:
raise Exception('data type %s not supported' % str(buf.dtype))
if prepare_fun is None:
rbtlib.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
_LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
buf.size, DTYPE_ENUM__[buf.dtype],
op, None, None)
else:
PFUNC = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
func_ptr = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
def pfunc(args):
"""prepare function."""
prepare_fun(data)
rbtlib.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
_LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
buf.size, DTYPE_ENUM__[buf.dtype],
op, PFUNC(pfunc), None)
check_err__()
op, func_ptr(pfunc), None)
return buf
def load_model__(ptr, length):
def _load_model(ptr, length):
"""
Internal function used by the module,
unpickle a model from a buffer specified by ptr, length
@ -230,77 +240,88 @@ def load_model__(ptr, length):
return pickle.loads(data.raw)
def load_checkpoint(with_local=False):
"""
load latest check point
Arguments:
with_local: boolean [default = False]
"""Load latest check point.
Parameters
----------
with_local: bool, optional
whether the checkpoint contains local model
Returns:
Returns
-------
tuple : tuple
if with_local: return (version, gobal_model, local_model)
else return (version, gobal_model)
if returned version == 0, this means no model has been CheckPointed
and global_model, local_model returned will be None
"""
gp = ctypes.POINTER(ctypes.c_char)()
gptr = ctypes.POINTER(ctypes.c_char)()
global_len = ctypes.c_ulong()
if with_local:
lp = ctypes.POINTER(ctypes.c_char)()
lptr = ctypes.POINTER(ctypes.c_char)()
local_len = ctypes.c_ulong()
version = rbtlib.RabitLoadCheckPoint(
ctypes.byref(gp),
version = _LIB.RabitLoadCheckPoint(
ctypes.byref(gptr),
ctypes.byref(global_len),
ctypes.byref(lp),
ctypes.byref(lptr),
ctypes.byref(local_len))
check_err__()
if version == 0:
return (version, None, None)
return (version,
load_model__(gp, global_len.value),
load_model__(lp, local_len.value))
_load_model(gptr, global_len.value),
_load_model(lptr, local_len.value))
else:
version = rbtlib.RabitLoadCheckPoint(
ctypes.byref(gp),
version = _LIB.RabitLoadCheckPoint(
ctypes.byref(gptr),
ctypes.byref(global_len),
None, None)
check_err__()
if version == 0:
return (version, None)
return (version,
load_model__(gp, global_len.value))
_load_model(gptr, global_len.value))
def checkpoint(global_model, local_model=None):
"""
checkpoint the model, meaning we finished a stage of execution
every time we call check point, there is a version number which will increase by one
"""Checkpoint the model.
Arguments:
This means we finished a stage of execution.
Every time we call check point, there is a version number which will increase by one.
Parameters
----------
global_model: anytype that can be pickled
globally shared model/state when calling this function,
the caller need to gauranttees that global_model is the same in all nodes
local_model: anytype that can be pickled
local model, that is specific to current node/rank.
Local model, that is specific to current node/rank.
This can be None when no local state is needed.
local_model requires explicit replication of the model for fault-tolerance,
which will bring replication cost in checkpoint function,
Notes
-----
local_model requires explicit replication of the model for fault-tolerance.
This will bring replication cost in checkpoint function.
while global_model do not need explicit replication.
It is recommended to use global_model if possible
It is recommended to use global_model if possible.
"""
sg = pickle.dumps(global_model)
sglobal = pickle.dumps(global_model)
if local_model is None:
rbtlib.RabitCheckPoint(sg, len(sg), None, 0)
check_err__()
del sg;
_LIB.RabitCheckPoint(sglobal, len(sglobal), None, 0)
del sglobal
else:
sl = pickle.dumps(local_model)
rbtlib.RabitCheckPoint(sg, len(sg), sl, len(sl))
check_err__()
del sl; del sg;
slocal = pickle.dumps(local_model)
_LIB.RabitCheckPoint(sglobal, len(sglobal), slocal, len(slocal))
del slocal
del sglobal
def version_number():
"""Returns version number of current stored model.
This means how many calls to CheckPoint we made so far.
Returns
-------
version : int
Version number of currently stored model
"""
Returns version number of current stored model,
which means how many calls to CheckPoint we made so far
"""
ret = rbtlib.RabitVersionNumber()
check_err__()
ret = _LIB.RabitVersionNumber()
return ret

View File

@ -1,3 +1,4 @@
// Copyright by Contributors
// implementations in ctypes
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
@ -60,12 +61,12 @@ inline void Allreduce_(void *sendrecvbuf_,
return;
case kLong:
rabit::Allreduce<OP>
(static_cast<long*>(sendrecvbuf_),
(static_cast<long*>(sendrecvbuf_), // NOLINT(*)
count, prepare_fun, prepare_arg);
return;
case kULong:
rabit::Allreduce<OP>
(static_cast<unsigned long*>(sendrecvbuf_),
(static_cast<unsigned long*>(sendrecvbuf_), // NOLINT(*)
count, prepare_fun, prepare_arg);
return;
case kFloat:
@ -179,7 +180,7 @@ extern "C" {
if (s.length() > max_len) {
s.resize(max_len - 1);
}
strcpy(out_name, s.c_str());
strcpy(out_name, s.c_str()); // NOLINT(*)
*out_len = static_cast<rbt_ulong>(s.length());
}
void RabitBroadcast(void *sendrecv_data,

View File

@ -1,18 +1,19 @@
#ifndef RABIT_WRAPPER_H_
#define RABIT_WRAPPER_H_
/*!
* Copyright by Contributors
* \file rabit_wrapper.h
* \author Tianqi Chen
* \brief a C style wrapper of rabit
* can be used to create wrapper of other languages
*/
#ifndef RABIT_WRAPPER_H_
#define RABIT_WRAPPER_H_
#ifdef _MSC_VER
#define RABIT_DLL __declspec(dllexport)
#else
#define RABIT_DLL
#endif
// manually define unsign long
typedef unsigned long rbt_ulong;
typedef unsigned long rbt_ulong; // NOLINT(*)
#ifdef __cplusplus
extern "C" {
@ -122,4 +123,4 @@ extern "C" {
#ifdef __cplusplus
} // C
#endif
#endif // XGBOOST_WRAPPER_H_
#endif // RABIT_WRAPPER_H_