diff --git a/Makefile b/Makefile index aa9bf632f..c9e35e80c 100644 --- a/Makefile +++ b/Makefile @@ -169,7 +169,7 @@ Rcheck: # lint requires dmlc to be in current folder lint: - dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package + dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package python-package clean: $(RM) -rf $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~ diff --git a/demo/.gitignore b/demo/.gitignore index e52797d15..ee79c704b 100644 --- a/demo/.gitignore +++ b/demo/.gitignore @@ -1 +1,2 @@ -*.libsvm \ No newline at end of file +*.libsvm +*.pkl diff --git a/demo/README.md b/demo/README.md index 49e9e52b8..fcfaa8434 100644 --- a/demo/README.md +++ b/demo/README.md @@ -1,14 +1,14 @@ XGBoost Examples ==== -This folder contains all the code examples using xgboost. +This folder contains all the code examples using xgboost. * Contributions of examples and benchmarks are more than welcome! * If you would like to share how you use xgboost to solve your problem, send a pull request:) - + Features Walkthrough ==== -This is a list of short codes introducing different functionalities of xgboost and its wrapper. -* Basic walkthrough of wrappers +This is a list of short examples introducing the different functionalities of the xgboost packages. +* Basic walkthrough of packages [python](guide-python/basic_walkthrough.py) [R](../R-package/demo/basic_walkthrough.R) [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl) @@ -20,18 +20,18 @@ This is a list of short codes introducing different functionalities of xgboost a [python](guide-python/boost_from_prediction.py) [R](../R-package/demo/boost_from_prediction.R) [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl) -* Predicting using first n trees +* Predicting using first n trees [python](guide-python/predict_first_ntree.py) [R](../R-package/demo/boost_from_prediction.R) - [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl) + [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl) * Generalized Linear Model [python](guide-python/generalized_linear_model.py) [R](../R-package/demo/generalized_linear_model.R) - [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/generalized_linear_model.jl) + [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/generalized_linear_model.jl) * Cross validation [python](guide-python/cross_validation.py) [R](../R-package/demo/cross_validation.R) - [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl) + [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl) * Predicting leaf indices [python](guide-python/predict_leaf_indices.py) [R](../R-package/demo/predict_leaf_indices.R) @@ -48,5 +48,5 @@ However, the parameter settings can be applied to all versions Benchmarks ==== * [Starter script for Kaggle Higgs Boson](kaggle-higgs) -* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution) +* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution) diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py index 56fed1dd2..7ce95b491 100755 --- a/demo/guide-python/sklearn_examples.py +++ b/demo/guide-python/sklearn_examples.py @@ -75,13 +75,3 @@ clf = xgb.XGBClassifier() clf.fit(X_train, y_train,
early_stopping_rounds=10, eval_metric="auc", eval_set=[(X_test, y_test)]) -# Custom evaluation function -from sklearn.metrics import log_loss - - -def log_loss_eval(y_pred, y_true): - return "log-loss", log_loss(y_true.get_label(), y_pred) - - -clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric=log_loss_eval, - eval_set=[(X_test, y_test)]) diff --git a/doc/python.md b/doc/python.md index dfe886fe9..93b5c43d4 100644 --- a/doc/python.md +++ b/doc/python.md @@ -14,7 +14,7 @@ A [walk through python example](https://github.com/tqchen/xgboost/blob/master/de = #### Install -To install XGBoost, you need to run `make` in the root directory of the project and then in the `wrappers` directory run +To install XGBoost, you need to run `make` in the root directory of the project and then in the `python-package` directory run ```shell python setup.py install diff --git a/python-package/.gitignore b/python-package/.gitignore new file mode 100644 index 000000000..d765c67c7 --- /dev/null +++ b/python-package/.gitignore @@ -0,0 +1,3 @@ +build +dist +*.egg* \ No newline at end of file diff --git a/python-package/README.md b/python-package/README.md new file mode 100644 index 000000000..a4ac71d4d --- /dev/null +++ b/python-package/README.md @@ -0,0 +1,7 @@ +XGBoost Python Package +====================== +* To make the python module, type ```./build.sh``` in the root directory of the project +* Make sure you have [setuptools](https://pypi.python.org/pypi/setuptools) installed +* Install with `python setup.py install` from this directory. +* Refer also to the walk-through example in the [demo folder](../demo/guide-python) +* **NOTE**: if you want to run XGBoost in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (available in Python 3.4+) or spawn backend; see the sklearn_parallel.py demo and the sketch below.
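The OpenMP note in the README above is the part new users most often trip over, so here is a minimal sketch of the forkserver approach it recommends. This sketch is not part of the diff: the data path `../data/agaricus.txt.train`, the parameter values, and the model file names are illustrative assumptions, and it presumes the `xgboost` package re-exports `DMatrix` and `train` as the demos do.

```python
# Sketch: train two boosters in separate processes using the forkserver
# start method (Python 3.4+), which avoids the fork/OpenMP deadlock the
# README warns about. Paths and parameters are placeholders.
import multiprocessing as mp

import xgboost as xgb


def train_one(seed):
    # each worker loads its own data and trains independently
    dtrain = xgb.DMatrix('../data/agaricus.txt.train')
    params = {'objective': 'binary:logistic', 'seed': seed}
    bst = xgb.train(params, dtrain, num_boost_round=10)
    bst.save_model('model_%d.bin' % seed)


if __name__ == '__main__':
    mp.set_start_method('forkserver')  # or 'spawn'
    workers = [mp.Process(target=train_one, args=(s,)) for s in (0, 1)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
```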
diff --git a/python-package/setup.py b/python-package/setup.py new file mode 100644 index 000000000..42e39f3ba --- /dev/null +++ b/python-package/setup.py @@ -0,0 +1,21 @@ +# pylint: disable=invalid-name +"""Setup xgboost package.""" +from __future__ import absolute_import +import sys +from setuptools import setup +sys.path.insert(0, '.') +import xgboost + +LIB_PATH = xgboost.core.find_lib_path() + +setup(name='xgboost', + version=xgboost.__version__, + description=xgboost.__doc__, + install_requires=[ + 'numpy', + 'scipy', + ], + zip_safe=False, + packages=['xgboost'], + data_files=[('xgboost', [LIB_PATH[0]])], + url='https://github.com/dmlc/xgboost') diff --git a/wrapper/xgboost.py b/python-package/xgboost/core.py similarity index 50% rename from wrapper/xgboost.py rename to python-package/xgboost/core.py index 32f9a52b4..85017cb82 100644 --- a/wrapper/xgboost.py +++ b/python-package/xgboost/core.py @@ -1,17 +1,10 @@ # coding: utf-8 -""" -xgboost: eXtreme Gradient Boosting library - -Version: 0.40 -Authors: Tianqi Chen, Bing Xu -Early stopping by Zygmunt Zając -""" -# pylint: disable=too-many-arguments, too-many-locals, too-many-lines, invalid-name, fixme +# pylint: disable=too-many-arguments +"""Core XGBoost Library.""" from __future__ import absolute_import import os import sys -import re import ctypes import platform import collections @@ -19,13 +12,6 @@ import collections import numpy as np import scipy.sparse -try: - from sklearn.base import BaseEstimator - from sklearn.base import RegressorMixin, ClassifierMixin - from sklearn.preprocessing import LabelEncoder - SKLEARN_INSTALLED = True -except ImportError: - SKLEARN_INSTALLED = False class XGBoostLibraryNotFound(Exception): """Error thrown when the xgboost library is not found""" pass @@ -35,7 +21,6 @@ class XGBoostError(Exception): """Error thrown by the xgboost trainer.""" pass -__all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train'] if sys.version_info[0] == 3: # pylint: disable=invalid-name STRING_TYPES = str, else: # pylint: disable=invalid-name STRING_TYPES = basestring, -def load_xglib(): - """Load the xgboost library.""" + +def find_lib_path(): + """Find the path to the xgboost dynamic library files. + + Returns + ------- + lib_path: list(string) + List of all found library paths to xgboost. + """ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - dll_path = [curr_path] + dll_path = [curr_path, os.path.join(curr_path, '../../wrapper/')] if os.name == 'nt': if platform.architecture()[0] == '64bit': - dll_path.append(os.path.join(curr_path, '../windows/x64/Release/')) + dll_path.append(os.path.join(curr_path, '../../windows/x64/Release/')) else: - dll_path.append(os.path.join(curr_path, '../windows/Release/')) + dll_path.append(os.path.join(curr_path, '../../windows/Release/')) if os.name == 'nt': dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path] else: dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path] lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] - if len(dll_path) == 0: + if len(lib_path) == 0: raise XGBoostLibraryNotFound( - 'cannot find find the files in the candicate path ' + str(dll_path)) + 'Cannot find the XGBoost library in the candidate paths. ' + + 'Did you run build.sh in the root path? Candidates: %s'
% str(dll_path)) + return lib_path + +def _load_lib(): + """Load the xgboost library.""" + lib_path = find_lib_path() lib = ctypes.cdll.LoadLibrary(lib_path[0]) lib.XGBGetLastError.restype = ctypes.c_char_p return lib # load the XGBoost library globally -_LIB = load_xglib() +_LIB = _load_lib() def _check_call(ret): """Check the return value of C API call @@ -117,7 +115,11 @@ def c_array(ctype, values): class DMatrix(object): - """Data Matrix used in XGBoost.""" + """Data Matrix used in XGBoost. + + DMatrix is an internal data structure used by XGBoost, + optimized for both memory efficiency and training speed. + """ def __init__(self, data, label=None, missing=0.0, weight=None, silent=False): """ Data matrix used in XGBoost. @@ -400,11 +402,14 @@ class DMatrix(object): class Booster(object): - """"A Booster of of XGBoost.""" + """A Booster of XGBoost. + + Booster is the model of xgboost, which contains the low-level routines for + training, prediction and evaluation. + """ def __init__(self, params=None, cache=(), model_file=None): # pylint: disable=invalid-name - """ - Learner class. + """Initialize the Booster. Parameters ---------- @@ -735,570 +740,3 @@ class Booster(object): else: fmap[fid] += 1 return fmap - - -def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, - early_stopping_rounds=None, evals_result=None, verbose_eval=True): - # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init - """Train a booster with given parameters. - - Parameters - ---------- - params : dict - Booster params. - dtrain : DMatrix - Data to be trained. - num_boost_round: int - Number of boosting iterations. - watchlist (evals): list of pairs (DMatrix, string) - List of items to be evaluated during training, this allows user to watch - performance on the validation set. - obj : function - Customized objective function. - feval : function - Customized evaluation function. - early_stopping_rounds: int - Activates early stopping. Validation error needs to decrease at least - every round(s) to continue training. - Requires at least one item in evals. - If there's more than one, will use the last. - Returns the model from the last iteration (not the best one). - If early stopping occurs, the model will have two additional fields: - bst.best_score and bst.best_iteration. - evals_result: dict - This dictionary stores the evaluation results of all the items in watchlist - verbose_eval : bool - If `verbose_eval` then the evaluation metric on the validation set, if - given, is printed at each boosting stage.
- - Returns - ------- - booster : a trained booster model - """ - evals = list(evals) - bst = Booster(params, [dtrain] + [d[0] for d in evals]) - - if evals_result is not None: - if not isinstance(evals_result, dict): - raise TypeError('evals_result has to be a dictionary') - else: - evals_name = [d[1] for d in evals] - evals_result.clear() - evals_result.update({key: [] for key in evals_name}) - - if not early_stopping_rounds: - for i in range(num_boost_round): - bst.update(dtrain, i, obj) - if len(evals) != 0: - bst_eval_set = bst.eval_set(evals, i, feval) - if isinstance(bst_eval_set, STRING_TYPES): - msg = bst_eval_set - else: - msg = bst_eval_set.decode() - - if verbose_eval: - sys.stderr.write(msg + '\n') - if evals_result is not None: - res = re.findall(":-?([0-9.]+).", msg) - for key, val in zip(evals_name, res): - evals_result[key].append(val) - return bst - - else: - # early stopping - if len(evals) < 1: - raise ValueError('For early stopping you need at least one set in evals.') - - sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\ - evals[-1][1], early_stopping_rounds)) - - # is params a list of tuples? are we using multiple eval metrics? - if isinstance(params, list): - if len(params) != len(dict(params).items()): - raise ValueError('Check your params.'\ - 'Early stopping works with single eval metric only.') - params = dict(params) - - # either minimize loss or maximize AUC/MAP/NDCG - maximize_score = False - if 'eval_metric' in params: - maximize_metrics = ('auc', 'map', 'ndcg') - if any(params['eval_metric'].startswith(x) for x in maximize_metrics): - maximize_score = True - - if maximize_score: - best_score = 0.0 - else: - best_score = float('inf') - - best_msg = '' - best_score_i = 0 - - for i in range(num_boost_round): - bst.update(dtrain, i, obj) - bst_eval_set = bst.eval_set(evals, i, feval) - - if isinstance(bst_eval_set, STRING_TYPES): - msg = bst_eval_set - else: - msg = bst_eval_set.decode() - - if verbose_eval: - sys.stderr.write(msg + '\n') - - if evals_result is not None: - res = re.findall(":-([0-9.]+).", msg) - for key, val in zip(evals_name, res): - evals_result[key].append(val) - - score = float(msg.rsplit(':', 1)[1]) - if (maximize_score and score > best_score) or \ - (not maximize_score and score < best_score): - best_score = score - best_score_i = i - best_msg = msg - elif i - best_score_i >= early_stopping_rounds: - sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg)) - bst.best_score = best_score - bst.best_iteration = best_score_i - break - bst.best_score = best_score - bst.best_iteration = best_score_i - return bst - - -class CVPack(object): - """"Auxiliary datastruct to hold one fold of CV.""" - def __init__(self, dtrain, dtest, param): - """"Initialize the CVPack""" - self.dtrain = dtrain - self.dtest = dtest - self.watchlist = [(dtrain, 'train'), (dtest, 'test')] - self.bst = Booster(param, [dtrain, dtest]) - - def update(self, iteration, fobj): - """"Update the boosters for one iteration""" - self.bst.update(self.dtrain, iteration, fobj) - - def eval(self, iteration, feval): - """"Evaluate the CVPack for one iteration.""" - return self.bst.eval_set(self.watchlist, iteration, feval) - - -def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None): - """ - Make an n-fold list of CVPack from random indices. 
- """ - evals = list(evals) - np.random.seed(seed) - randidx = np.random.permutation(dall.num_row()) - kstep = len(randidx) / nfold - idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)] - ret = [] - for k in range(nfold): - dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i])) - dtest = dall.slice(idset[k]) - # run preprocessing on the data set if needed - if fpreproc is not None: - dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy()) - else: - tparam = param - plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals] - ret.append(CVPack(dtrain, dtest, plst)) - return ret - - -def aggcv(rlist, show_stdv=True): - # pylint: disable=invalid-name - """ - Aggregate cross-validation results. - """ - cvmap = {} - ret = rlist[0].split()[0] - for line in rlist: - arr = line.split() - assert ret == arr[0] - for it in arr[1:]: - if not isinstance(it, STRING_TYPES): - it = it.decode() - k, v = it.split(':') - if k not in cvmap: - cvmap[k] = [] - cvmap[k].append(float(v)) - for k, v in sorted(cvmap.items(), key=lambda x: x[0]): - v = np.array(v) - if not isinstance(ret, STRING_TYPES): - ret = ret.decode() - if show_stdv: - ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v)) - else: - ret += '\tcv-%s:%f' % (k, np.mean(v)) - return ret - - -def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), - obj=None, feval=None, fpreproc=None, show_stdv=True, seed=0): - # pylint: disable = invalid-name - """Cross-validation with given paramaters. - - Parameters - ---------- - params : dict - Booster params. - dtrain : DMatrix - Data to be trained. - num_boost_round : int - Number of boosting iterations. - nfold : int - Number of folds in CV. - metrics : list of strings - Evaluation metrics to be watched in CV. - obj : function - Custom objective function. - feval : function - Custom evaluation function. - fpreproc : function - Preprocessing function that takes (dtrain, dtest, param) and returns - transformed versions of those. - show_stdv : bool - Whether to display the standard deviation. - seed : int - Seed used to generate the folds (passed to numpy.random.seed). - - Returns - ------- - evaluation history : list(string) - """ - results = [] - cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc) - for i in range(num_boost_round): - for fold in cvfolds: - fold.update(i, obj) - res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv) - sys.stderr.write(res + '\n') - results.append(res) - return results - - -# used for compatiblity without sklearn -XGBModelBase = object -XGBClassifierBase = object -XGBRegressorBase = object -if SKLEARN_INSTALLED: - XGBModelBase = BaseEstimator - XGBRegressorBase = RegressorMixin - XGBClassifierBase = ClassifierMixin - -class XGBModel(XGBModelBase): - # pylint: disable=too-many-arguments, too-many-instance-attributes, invalid-name - """Implementation of the Scikit-Learn API for XGBoost. - - Parameters - ---------- - max_depth : int - Maximum tree depth for base learners. - learning_rate : float - Boosting learning rate (xgb's "eta") - n_estimators : int - Number of boosted trees to fit. - silent : boolean - Whether to print messages while running boosting. - objective : string - Specify the learning task and the corresponding learning objective. - - nthread : int - Number of parallel threads used to run xgboost. - gamma : float - Minimum loss reduction required to make a further partition on a leaf node of the tree. 
- min_child_weight : int - Minimum sum of instance weight(hessian) needed in a child. - max_delta_step : int - Maximum delta step we allow each tree's weight estimation to be. - subsample : float - Subsample ratio of the training instance. - colsample_bytree : float - Subsample ratio of columns when constructing each tree. - - base_score: - The initial prediction score of all instances, global bias. - seed : int - Random number seed. - missing : float, optional - Value in the data which needs to be present as a missing value. If - None, defaults to np.nan. - """ - def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, - silent=True, objective="reg:linear", - nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, - subsample=1, colsample_bytree=1, - base_score=0.5, seed=0, missing=None): - if not SKLEARN_INSTALLED: - raise XGBoostError('sklearn needs to be installed in order to use this module') - self.max_depth = max_depth - self.learning_rate = learning_rate - self.n_estimators = n_estimators - self.silent = silent - self.objective = objective - - self.nthread = nthread - self.gamma = gamma - self.min_child_weight = min_child_weight - self.max_delta_step = max_delta_step - self.subsample = subsample - self.colsample_bytree = colsample_bytree - - self.base_score = base_score - self.seed = seed - self.missing = missing if missing is not None else np.nan - self._Booster = None - - def __setstate__(self, state): - # backward compatiblity code - # load booster from raw if it is raw - # the booster now support pickle - bst = state["_Booster"] - if bst is not None and not isinstance(bst, Booster): - state["_Booster"] = Booster(model_file=bst) - self.__dict__.update(state) - - def booster(self): - """Get the underlying xgboost Booster of this model. - - This will raise an exception when fit was not called - - Returns - ------- - booster : a xgboost booster of underlying model - """ - if self._Booster is None: - raise XGBoostError('need to call fit beforehand') - return self._Booster - - def get_params(self, deep=False): - """Get parameter.s""" - params = super(XGBModel, self).get_params(deep=deep) - if params['missing'] is np.nan: - params['missing'] = None # sklearn doesn't handle nan. see #4725 - if not params.get('eval_metric', True): - del params['eval_metric'] # don't give as None param to Booster - return params - - def get_xgb_params(self): - """Get xgboost type parameters.""" - xgb_params = self.get_params() - - xgb_params['silent'] = 1 if self.silent else 0 - - if self.nthread <= 0: - xgb_params.pop('nthread', None) - return xgb_params - - def fit(self, X, y, eval_set=None, eval_metric=None, - early_stopping_rounds=None, verbose=True): - # pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init - """ - Fit the gradient boosting model - - Parameters - ---------- - X : array_like - Feature matrix - y : array_like - Labels - eval_set : list, optional - A list of (X, y) tuple pairs to use as a validation set for - early-stopping - eval_metric : str, callable, optional - If a str, should be a built-in evaluation metric to use. See - doc/parameter.md. If callable, a custom evaluation metric. The call - signature is func(y_predicted, y_true) where y_true will be a - DMatrix object such that you may need to call the get_label - method. It must return a str, value pair where the str is a name - for the evaluation and value is the value of the evaluation - function. This objective is always minimized. - early_stopping_rounds : int - Activates early stopping. 
Validation error needs to decrease at - least every round(s) to continue training. - Requires at least one item in evals. If there's more than one, - will use the last. Returns the model from the last iteration - (not the best one). If early stopping occurs, the model will - have two additional fields: bst.best_score and bst.best_iteration. - verbose : bool - If `verbose` and an evaluation set is used, writes the evaluation - metric measured on the validation set to stderr. - """ - trainDmatrix = DMatrix(X, label=y, missing=self.missing) - - eval_results = {} - if eval_set is not None: - evals = list(DMatrix(x[0], label=x[1]) for x in eval_set) - evals = list(zip(evals, ["validation_{}".format(i) for i in - range(len(evals))])) - else: - evals = () - - params = self.get_xgb_params() - - feval = eval_metric if callable(eval_metric) else None - if eval_metric is not None: - if callable(eval_metric): - eval_metric = None - else: - params.update({'eval_metric': eval_metric}) - - self._Booster = train(params, trainDmatrix, - self.n_estimators, evals=evals, - early_stopping_rounds=early_stopping_rounds, - evals_result=eval_results, feval=feval, - verbose_eval=verbose) - if eval_results: - eval_results = {k: np.array(v, dtype=float) - for k, v in eval_results.items()} - eval_results = {k: np.array(v) for k, v in eval_results.items()} - self.eval_results = eval_results - - if early_stopping_rounds is not None: - self.best_score = self._Booster.best_score - self.best_iteration = self._Booster.best_iteration - return self - - def predict(self, data): - # pylint: disable=missing-docstring,invalid-name - test_dmatrix = DMatrix(data, missing=self.missing) - return self.booster().predict(test_dmatrix) - - -class XGBClassifier(XGBModel, XGBClassifierBase): - # pylint: disable=missing-docstring,too-many-arguments,invalid-name - __doc__ = """ - Implementation of the scikit-learn API for XGBoost classification - """ + "\n".join(XGBModel.__doc__.split('\n')[2:]) - - def __init__(self, max_depth=3, learning_rate=0.1, - n_estimators=100, silent=True, - objective="binary:logistic", - nthread=-1, gamma=0, min_child_weight=1, - max_delta_step=0, subsample=1, colsample_bytree=1, - base_score=0.5, seed=0, missing=None): - super(XGBClassifier, self).__init__(max_depth, learning_rate, - n_estimators, silent, objective, - nthread, gamma, min_child_weight, - max_delta_step, subsample, - colsample_bytree, - base_score, seed, missing) - - def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None, - early_stopping_rounds=None, verbose=True): - # pylint: disable = attribute-defined-outside-init,arguments-differ - """ - Fit gradient boosting classifier - - Parameters - ---------- - X : array_like - Feature matrix - y : array_like - Labels - sample_weight : array_like - Weight for each instance - eval_set : list, optional - A list of (X, y) pairs to use as a validation set for - early-stopping - eval_metric : str, callable, optional - If a str, should be a built-in evaluation metric to use. See - doc/parameter.md. If callable, a custom evaluation metric. The call - signature is func(y_predicted, y_true) where y_true will be a - DMatrix object such that you may need to call the get_label - method. It must return a str, value pair where the str is a name - for the evaluation and value is the value of the evaluation - function. This objective is always minimized. - early_stopping_rounds : int, optional - Activates early stopping. Validation error needs to decrease at - least every round(s) to continue training. 
- Requires at least one item in evals. If there's more than one, - will use the last. Returns the model from the last iteration - (not the best one). If early stopping occurs, the model will - have two additional fields: bst.best_score and bst.best_iteration. - verbose : bool - If `verbose` and an evaluation set is used, writes the evaluation - metric measured on the validation set to stderr. - """ - eval_results = {} - self.classes_ = list(np.unique(y)) - self.n_classes_ = len(self.classes_) - if self.n_classes_ > 2: - # Switch to using a multiclass objective in the underlying XGB instance - self.objective = "multi:softprob" - xgb_options = self.get_xgb_params() - xgb_options['num_class'] = self.n_classes_ - else: - xgb_options = self.get_xgb_params() - - feval = eval_metric if callable(eval_metric) else None - if eval_metric is not None: - if callable(eval_metric): - eval_metric = None - else: - xgb_options.update({"eval_metric": eval_metric}) - - if eval_set is not None: - # TODO: use sample_weight if given? - evals = list(DMatrix(x[0], label=x[1]) for x in eval_set) - nevals = len(evals) - eval_names = ["validation_{}".format(i) for i in range(nevals)] - evals = list(zip(evals, eval_names)) - else: - evals = () - - self._le = LabelEncoder().fit(y) - training_labels = self._le.transform(y) - - if sample_weight is not None: - train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight, - missing=self.missing) - else: - train_dmatrix = DMatrix(X, label=training_labels, - missing=self.missing) - - self._Booster = train(xgb_options, train_dmatrix, self.n_estimators, - evals=evals, - early_stopping_rounds=early_stopping_rounds, - evals_result=eval_results, feval=feval, - verbose_eval=verbose) - - if eval_results: - eval_results = {k: np.array(v, dtype=float) - for k, v in eval_results.items()} - self.eval_results = eval_results - - if early_stopping_rounds is not None: - self.best_score = self._Booster.best_score - self.best_iteration = self._Booster.best_iteration - - return self - - def predict(self, data): - test_dmatrix = DMatrix(data, missing=self.missing) - class_probs = self.booster().predict(test_dmatrix) - if len(class_probs.shape) > 1: - column_indexes = np.argmax(class_probs, axis=1) - else: - column_indexes = np.repeat(0, data.shape[0]) - column_indexes[class_probs > 0.5] = 1 - return self._le.inverse_transform(column_indexes) - - def predict_proba(self, data): - test_dmatrix = DMatrix(data, missing=self.missing) - class_probs = self.booster().predict(test_dmatrix) - if self.objective == "multi:softprob": - return class_probs - else: - classone_probs = class_probs - classzero_probs = 1.0 - classone_probs - return np.vstack((classzero_probs, classone_probs)).transpose() - -class XGBRegressor(XGBModel, XGBRegressorBase): - # pylint: disable=missing-docstring - __doc__ = """ - Implementation of the scikit-learn API for XGBoost regression - """ + "\n".join(XGBModel.__doc__.split('\n')[2:]) diff --git a/windows/README.md b/windows/README.md index cb1cc9dd9..564c97d25 100644 --- a/windows/README.md +++ b/windows/README.md @@ -11,7 +11,7 @@ This should give you xgboost.exe for CLI version and xgboost_wrapper.dll for pyt Use Python Module ===== -* After you build the dll, you can install the Python package from the [../wrapper](../wrapper) folder +* After you build the dll, you can install the Python package from the [../python-package](../python-package) folder ``` python setup.py install diff --git a/wrapper/README.md b/wrapper/README.md index c5368bd7d..77316e15c 100644 --- 
a/wrapper/README.md +++ b/wrapper/README.md @@ -1,20 +1,9 @@ -Wrapper of XGBoost -===== -This folder provides wrapper of xgboost to other languages +XGBoost Wrappers +================ +This folder provides wrappers that create xgboost packages for other languages. -Python -===== -* To make the python module, type ```./build.sh``` in the root directory of project -* Make sure you have [setuptools](https://pypi.python.org/pypi/setuptools) -* Install with `python setup.py install` from this directory. -* Refer also to the walk through example in [demo folder](../demo/guide-python) -* **NOTE**: if you want to run XGBoost process in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the sklearn_parallel.py demo. - - -R -===== -* See [R-package](../R-package) - -Julia -===== -* See [XGBoost.jl](https://github.com/antinucleon/XGBoost.jl) +***Supported Language Packages*** +* [Python package](../python-package) +* [R-package](../R-package) +* [Java Package](../java) +* [Julia Package](https://github.com/antinucleon/XGBoost.jl) diff --git a/wrapper/__init__.py b/wrapper/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/wrapper/setup.py b/wrapper/setup.py deleted file mode 100644 index 5365d61b0..000000000 --- a/wrapper/setup.py +++ /dev/null @@ -1,39 +0,0 @@ -# pylint: disable=invalid-name -"""Setup xgboost package.""" -import os -import platform -from setuptools import setup - - -class XGBoostLibraryNotFound(Exception): - """Exception to raise when xgboost library cannot be found.""" - pass - - -curr_dir = os.path.dirname(os.path.abspath(__file__)) -dll_path = [curr_dir] - -if os.name == 'nt': - if platform.architecture()[0] == '64bit': - dll_path.append(os.path.join(curr_dir, '../windows/x64/Release/')) - else: - dll_path.append(os.path.join(curr_dir, '../windows/Release/')) - - -if os.name == 'nt': - dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path] -else: - dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path] - -lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] - -if len(lib_path) == 0: - raise XGBoostLibraryNotFound("XGBoost library not found. Did you run " - "../make?") -setup(name="xgboost", - version="0.40", - description="Python wrappers for XGBoost: eXtreme Gradient Boosting", - zip_safe=False, - py_modules=['xgboost'], - data_files=[('.', [lib_path[0]])], - url="https://github.com/dmlc/xgboost")
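For reference, the install flow this diff leaves behind is: run `./build.sh` (or `make`) in the repository root to produce `libxgboostwrapper.so`, then run `python setup.py install` inside `python-package/`. Below is a small post-install sanity check; it is a sketch rather than part of the diff, but it only uses names defined above (`xgboost.__version__` and `xgboost.core.find_lib_path`).

```python
# Post-install sanity check (a sketch, not part of this diff).
# find_lib_path() is defined in python-package/xgboost/core.py above and
# raises XGBoostLibraryNotFound when no compiled library can be located.
import xgboost

print(xgboost.__version__)           # the version string setup.py reuses
print(xgboost.core.find_lib_path())  # e.g. ['.../libxgboostwrapper.so']
```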