diff --git a/.gitignore b/.gitignore
index 73ae6748e..048803abd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,10 +48,9 @@ Debug
 *.cpage.col
 *.cpage
 *.Rproj
-xgboost
-xgboost.mpi
-xgboost.mock
-train*
+./xgboost
+./xgboost.mpi
+./xgboost.mock
 rabit
 #.Rbuildignore
 R-package.Rproj
diff --git a/python-package/xgboost/__init__.py b/python-package/xgboost/__init__.py
new file mode 100644
index 000000000..6f967b837
--- /dev/null
+++ b/python-package/xgboost/__init__.py
@@ -0,0 +1,12 @@
+# coding: utf-8
+"""XGBoost: eXtreme Gradient Boosting library.
+
+Contributors: https://github.com/dmlc/xgboost/blob/master/CONTRIBUTORS.md
+"""
+
+from __future__ import absolute_import
+from .core import DMatrix, Booster
+from .training import train, cv
+from .sklearn import XGBModel, XGBClassifier, XGBRegressor
+
+__version__ = '0.4'
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
new file mode 100644
index 000000000..4a5771724
--- /dev/null
+++ b/python-package/xgboost/sklearn.py
@@ -0,0 +1,341 @@
+# coding: utf-8
+# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
+"""Scikit-Learn Wrapper interface for XGBoost."""
+from __future__ import absolute_import
+
+import numpy as np
+from .core import Booster, DMatrix, XGBoostError
+from .training import train
+
+try:
+    from sklearn.base import BaseEstimator
+    from sklearn.base import RegressorMixin, ClassifierMixin
+    from sklearn.preprocessing import LabelEncoder
+    SKLEARN_INSTALLED = True
+except ImportError:
+    SKLEARN_INSTALLED = False
+
+# used for compatiblity without sklearn
+XGBModelBase = object
+XGBClassifierBase = object
+XGBRegressorBase = object
+
+if SKLEARN_INSTALLED:
+    XGBModelBase = BaseEstimator
+    XGBRegressorBase = RegressorMixin
+    XGBClassifierBase = ClassifierMixin
+
+class XGBModel(XGBModelBase):
+    # pylint: disable=too-many-arguments, too-many-instance-attributes, invalid-name
+    """Implementation of the Scikit-Learn API for XGBoost.
+
+    Parameters
+    ----------
+    max_depth : int
+        Maximum tree depth for base learners.
+    learning_rate : float
+        Boosting learning rate (xgb's "eta")
+    n_estimators : int
+        Number of boosted trees to fit.
+    silent : boolean
+        Whether to print messages while running boosting.
+    objective : string
+        Specify the learning task and the corresponding learning objective.
+
+    nthread : int
+        Number of parallel threads used to run xgboost.
+    gamma : float
+        Minimum loss reduction required to make a further partition on a leaf node of the tree.
+    min_child_weight : int
+        Minimum sum of instance weight(hessian) needed in a child.
+    max_delta_step : int
+        Maximum delta step we allow each tree's weight estimation to be.
+    subsample : float
+        Subsample ratio of the training instance.
+    colsample_bytree : float
+        Subsample ratio of columns when constructing each tree.
+
+    base_score:
+        The initial prediction score of all instances, global bias.
+    seed : int
+        Random number seed.
+    missing : float, optional
+        Value in the data which needs to be present as a missing value. If
+        None, defaults to np.nan.
+    """
+    def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
+                 silent=True, objective="reg:linear",
+                 nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0,
+                 subsample=1, colsample_bytree=1,
+                 base_score=0.5, seed=0, missing=None):
+        if not SKLEARN_INSTALLED:
+            raise XGBoostError('sklearn needs to be installed in order to use this module')
+        self.max_depth = max_depth
+        self.learning_rate = learning_rate
+        self.n_estimators = n_estimators
+        self.silent = silent
+        self.objective = objective
+
+        self.nthread = nthread
+        self.gamma = gamma
+        self.min_child_weight = min_child_weight
+        self.max_delta_step = max_delta_step
+        self.subsample = subsample
+        self.colsample_bytree = colsample_bytree
+
+        self.base_score = base_score
+        self.seed = seed
+        self.missing = missing if missing is not None else np.nan
+        self._Booster = None
+
+    def __setstate__(self, state):
+        # backward compatiblity code
+        # load booster from raw if it is raw
+        # the booster now support pickle
+        bst = state["_Booster"]
+        if bst is not None and not isinstance(bst, Booster):
+            state["_Booster"] = Booster(model_file=bst)
+        self.__dict__.update(state)
+
+    def booster(self):
+        """Get the underlying xgboost Booster of this model.
+
+        This will raise an exception when fit was not called
+
+        Returns
+        -------
+        booster : a xgboost booster of underlying model
+        """
+        if self._Booster is None:
+            raise XGBoostError('need to call fit beforehand')
+        return self._Booster
+
+    def get_params(self, deep=False):
+        """Get parameter.s"""
+        params = super(XGBModel, self).get_params(deep=deep)
+        if params['missing'] is np.nan:
+            params['missing'] = None  # sklearn doesn't handle nan. see #4725
+        if not params.get('eval_metric', True):
+            del params['eval_metric']  # don't give as None param to Booster
+        return params
+
+    def get_xgb_params(self):
+        """Get xgboost type parameters."""
+        xgb_params = self.get_params()
+
+        xgb_params['silent'] = 1 if self.silent else 0
+
+        if self.nthread <= 0:
+            xgb_params.pop('nthread', None)
+        return xgb_params
+
+    def fit(self, X, y, eval_set=None, eval_metric=None,
+            early_stopping_rounds=None, verbose=True):
+        # pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init
+        """
+        Fit the gradient boosting model
+
+        Parameters
+        ----------
+        X : array_like
+            Feature matrix
+        y : array_like
+            Labels
+        eval_set : list, optional
+            A list of (X, y) tuple pairs to use as a validation set for
+            early-stopping
+        eval_metric : str, callable, optional
+            If a str, should be a built-in evaluation metric to use. See
+            doc/parameter.md. If callable, a custom evaluation metric. The call
+            signature is func(y_predicted, y_true) where y_true will be a
+            DMatrix object such that you may need to call the get_label
+            method. It must return a str, value pair where the str is a name
+            for the evaluation and value is the value of the evaluation
+            function. This objective is always minimized.
+        early_stopping_rounds : int
+            Activates early stopping. Validation error needs to decrease at
+            least every <early_stopping_rounds> round(s) to continue training.
+            Requires at least one item in evals.  If there's more than one,
+            will use the last. Returns the model from the last iteration
+            (not the best one). If early stopping occurs, the model will
+            have two additional fields: bst.best_score and bst.best_iteration.
+        verbose : bool
+            If `verbose` and an evaluation set is used, writes the evaluation
+            metric measured on the validation set to stderr.
+        """
+        trainDmatrix = DMatrix(X, label=y, missing=self.missing)
+
+        eval_results = {}
+        if eval_set is not None:
+            evals = list(DMatrix(x[0], label=x[1]) for x in eval_set)
+            evals = list(zip(evals, ["validation_{}".format(i) for i in
+                                     range(len(evals))]))
+        else:
+            evals = ()
+
+        params = self.get_xgb_params()
+
+        feval = eval_metric if callable(eval_metric) else None
+        if eval_metric is not None:
+            if callable(eval_metric):
+                eval_metric = None
+            else:
+                params.update({'eval_metric': eval_metric})
+
+        self._Booster = train(params, trainDmatrix,
+                              self.n_estimators, evals=evals,
+                              early_stopping_rounds=early_stopping_rounds,
+                              evals_result=eval_results, feval=feval,
+                              verbose_eval=verbose)
+        if eval_results:
+            eval_results = {k: np.array(v, dtype=float)
+                            for k, v in eval_results.items()}
+            eval_results = {k: np.array(v) for k, v in eval_results.items()}
+            self.eval_results = eval_results
+
+        if early_stopping_rounds is not None:
+            self.best_score = self._Booster.best_score
+            self.best_iteration = self._Booster.best_iteration
+        return self
+
+    def predict(self, data):
+        # pylint: disable=missing-docstring,invalid-name
+        test_dmatrix = DMatrix(data, missing=self.missing)
+        return self.booster().predict(test_dmatrix)
+
+
+class XGBClassifier(XGBModel, XGBClassifierBase):
+    # pylint: disable=missing-docstring,too-many-arguments,invalid-name
+    __doc__ = """
+    Implementation of the scikit-learn API for XGBoost classification
+    """ + "\n".join(XGBModel.__doc__.split('\n')[2:])
+
+    def __init__(self, max_depth=3, learning_rate=0.1,
+                 n_estimators=100, silent=True,
+                 objective="binary:logistic",
+                 nthread=-1, gamma=0, min_child_weight=1,
+                 max_delta_step=0, subsample=1, colsample_bytree=1,
+                 base_score=0.5, seed=0, missing=None):
+        super(XGBClassifier, self).__init__(max_depth, learning_rate,
+                                            n_estimators, silent, objective,
+                                            nthread, gamma, min_child_weight,
+                                            max_delta_step, subsample,
+                                            colsample_bytree,
+                                            base_score, seed, missing)
+
+    def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
+            early_stopping_rounds=None, verbose=True):
+        # pylint: disable = attribute-defined-outside-init,arguments-differ
+        """
+        Fit gradient boosting classifier
+
+        Parameters
+        ----------
+        X : array_like
+            Feature matrix
+        y : array_like
+            Labels
+        sample_weight : array_like
+            Weight for each instance
+        eval_set : list, optional
+            A list of (X, y) pairs to use as a validation set for
+            early-stopping
+        eval_metric : str, callable, optional
+            If a str, should be a built-in evaluation metric to use. See
+            doc/parameter.md. If callable, a custom evaluation metric. The call
+            signature is func(y_predicted, y_true) where y_true will be a
+            DMatrix object such that you may need to call the get_label
+            method. It must return a str, value pair where the str is a name
+            for the evaluation and value is the value of the evaluation
+            function. This objective is always minimized.
+        early_stopping_rounds : int, optional
+            Activates early stopping. Validation error needs to decrease at
+            least every <early_stopping_rounds> round(s) to continue training.
+            Requires at least one item in evals.  If there's more than one,
+            will use the last. Returns the model from the last iteration
+            (not the best one). If early stopping occurs, the model will
+            have two additional fields: bst.best_score and bst.best_iteration.
+        verbose : bool
+            If `verbose` and an evaluation set is used, writes the evaluation
+            metric measured on the validation set to stderr.
+        """
+        eval_results = {}
+        self.classes_ = list(np.unique(y))
+        self.n_classes_ = len(self.classes_)
+        if self.n_classes_ > 2:
+            # Switch to using a multiclass objective in the underlying XGB instance
+            self.objective = "multi:softprob"
+            xgb_options = self.get_xgb_params()
+            xgb_options['num_class'] = self.n_classes_
+        else:
+            xgb_options = self.get_xgb_params()
+
+        feval = eval_metric if callable(eval_metric) else None
+        if eval_metric is not None:
+            if callable(eval_metric):
+                eval_metric = None
+            else:
+                xgb_options.update({"eval_metric": eval_metric})
+
+        if eval_set is not None:
+            # TODO: use sample_weight if given?
+            evals = list(DMatrix(x[0], label=x[1]) for x in eval_set)
+            nevals = len(evals)
+            eval_names = ["validation_{}".format(i) for i in range(nevals)]
+            evals = list(zip(evals, eval_names))
+        else:
+            evals = ()
+
+        self._le = LabelEncoder().fit(y)
+        training_labels = self._le.transform(y)
+
+        if sample_weight is not None:
+            train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
+                                    missing=self.missing)
+        else:
+            train_dmatrix = DMatrix(X, label=training_labels,
+                                    missing=self.missing)
+
+        self._Booster = train(xgb_options, train_dmatrix, self.n_estimators,
+                              evals=evals,
+                              early_stopping_rounds=early_stopping_rounds,
+                              evals_result=eval_results, feval=feval,
+                              verbose_eval=verbose)
+
+        if eval_results:
+            eval_results = {k: np.array(v, dtype=float)
+                            for k, v in eval_results.items()}
+            self.eval_results = eval_results
+
+        if early_stopping_rounds is not None:
+            self.best_score = self._Booster.best_score
+            self.best_iteration = self._Booster.best_iteration
+
+        return self
+
+    def predict(self, data):
+        test_dmatrix = DMatrix(data, missing=self.missing)
+        class_probs = self.booster().predict(test_dmatrix)
+        if len(class_probs.shape) > 1:
+            column_indexes = np.argmax(class_probs, axis=1)
+        else:
+            column_indexes = np.repeat(0, data.shape[0])
+            column_indexes[class_probs > 0.5] = 1
+        return self._le.inverse_transform(column_indexes)
+
+    def predict_proba(self, data):
+        test_dmatrix = DMatrix(data, missing=self.missing)
+        class_probs = self.booster().predict(test_dmatrix)
+        if self.objective == "multi:softprob":
+            return class_probs
+        else:
+            classone_probs = class_probs
+            classzero_probs = 1.0 - classone_probs
+            return np.vstack((classzero_probs, classone_probs)).transpose()
+
+class XGBRegressor(XGBModel, XGBRegressorBase):
+    # pylint: disable=missing-docstring
+    __doc__ = """
+    Implementation of the scikit-learn API for XGBoost regression
+    """ + "\n".join(XGBModel.__doc__.split('\n')[2:])
+
diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py
new file mode 100644
index 000000000..1f2d722ac
--- /dev/null
+++ b/python-package/xgboost/training.py
@@ -0,0 +1,252 @@
+# coding: utf-8
+# pylint: disable=too-many-locals, too-many-arguments, invalid-name
+"""Training Library containing training routines."""
+from __future__ import absolute_import
+
+import sys
+import re
+import numpy as np
+from .core import Booster, STRING_TYPES
+
+def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
+          early_stopping_rounds=None, evals_result=None, verbose_eval=True):
+    # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
+    """Train a booster with given parameters.
+
+    Parameters
+    ----------
+    params : dict
+        Booster params.
+    dtrain : DMatrix
+        Data to be trained.
+    num_boost_round: int
+        Number of boosting iterations.
+    watchlist (evals): list of pairs (DMatrix, string)
+        List of items to be evaluated during training, this allows user to watch
+        performance on the validation set.
+    obj : function
+        Customized objective function.
+    feval : function
+        Customized evaluation function.
+    early_stopping_rounds: int
+        Activates early stopping. Validation error needs to decrease at least
+        every <early_stopping_rounds> round(s) to continue training.
+        Requires at least one item in evals.
+        If there's more than one, will use the last.
+        Returns the model from the last iteration (not the best one).
+        If early stopping occurs, the model will have two additional fields:
+        bst.best_score and bst.best_iteration.
+    evals_result: dict
+        This dictionary stores the evaluation results of all the items in watchlist
+    verbose_eval : bool
+        If `verbose_eval` then the evaluation metric on the validation set, if
+        given, is printed at each boosting stage.
+
+    Returns
+    -------
+    booster : a trained booster model
+    """
+    evals = list(evals)
+    bst = Booster(params, [dtrain] + [d[0] for d in evals])
+
+    if evals_result is not None:
+        if not isinstance(evals_result, dict):
+            raise TypeError('evals_result has to be a dictionary')
+        else:
+            evals_name = [d[1] for d in evals]
+            evals_result.clear()
+            evals_result.update({key: [] for key in evals_name})
+
+    if not early_stopping_rounds:
+        for i in range(num_boost_round):
+            bst.update(dtrain, i, obj)
+            if len(evals) != 0:
+                bst_eval_set = bst.eval_set(evals, i, feval)
+                if isinstance(bst_eval_set, STRING_TYPES):
+                    msg = bst_eval_set
+                else:
+                    msg = bst_eval_set.decode()
+
+                if verbose_eval:
+                    sys.stderr.write(msg + '\n')
+                if evals_result is not None:
+                    res = re.findall(":-?([0-9.]+).", msg)
+                    for key, val in zip(evals_name, res):
+                        evals_result[key].append(val)
+        return bst
+
+    else:
+        # early stopping
+        if len(evals) < 1:
+            raise ValueError('For early stopping you need at least one set in evals.')
+
+        sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\
+                evals[-1][1], early_stopping_rounds))
+
+        # is params a list of tuples? are we using multiple eval metrics?
+        if isinstance(params, list):
+            if len(params) != len(dict(params).items()):
+                raise ValueError('Check your params.'\
+                                     'Early stopping works with single eval metric only.')
+            params = dict(params)
+
+        # either minimize loss or maximize AUC/MAP/NDCG
+        maximize_score = False
+        if 'eval_metric' in params:
+            maximize_metrics = ('auc', 'map', 'ndcg')
+            if any(params['eval_metric'].startswith(x) for x in maximize_metrics):
+                maximize_score = True
+
+        if maximize_score:
+            best_score = 0.0
+        else:
+            best_score = float('inf')
+
+        best_msg = ''
+        best_score_i = 0
+
+        for i in range(num_boost_round):
+            bst.update(dtrain, i, obj)
+            bst_eval_set = bst.eval_set(evals, i, feval)
+
+            if isinstance(bst_eval_set, STRING_TYPES):
+                msg = bst_eval_set
+            else:
+                msg = bst_eval_set.decode()
+
+            if verbose_eval:
+                sys.stderr.write(msg + '\n')
+
+            if evals_result is not None:
+                res = re.findall(":-([0-9.]+).", msg)
+                for key, val in zip(evals_name, res):
+                    evals_result[key].append(val)
+
+            score = float(msg.rsplit(':', 1)[1])
+            if (maximize_score and score > best_score) or \
+                    (not maximize_score and score < best_score):
+                best_score = score
+                best_score_i = i
+                best_msg = msg
+            elif i - best_score_i >= early_stopping_rounds:
+                sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg))
+                bst.best_score = best_score
+                bst.best_iteration = best_score_i
+                break
+        bst.best_score = best_score
+        bst.best_iteration = best_score_i
+        return bst
+
+
+class CVPack(object):
+    """"Auxiliary datastruct to hold one fold of CV."""
+    def __init__(self, dtrain, dtest, param):
+        """"Initialize the CVPack"""
+        self.dtrain = dtrain
+        self.dtest = dtest
+        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
+        self.bst = Booster(param, [dtrain, dtest])
+
+    def update(self, iteration, fobj):
+        """"Update the boosters for one iteration"""
+        self.bst.update(self.dtrain, iteration, fobj)
+
+    def eval(self, iteration, feval):
+        """"Evaluate the CVPack for one iteration."""
+        return self.bst.eval_set(self.watchlist, iteration, feval)
+
+
+def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None):
+    """
+    Make an n-fold list of CVPack from random indices.
+    """
+    evals = list(evals)
+    np.random.seed(seed)
+    randidx = np.random.permutation(dall.num_row())
+    kstep = len(randidx) / nfold
+    idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
+    ret = []
+    for k in range(nfold):
+        dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i]))
+        dtest = dall.slice(idset[k])
+        # run preprocessing on the data set if needed
+        if fpreproc is not None:
+            dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
+        else:
+            tparam = param
+        plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals]
+        ret.append(CVPack(dtrain, dtest, plst))
+    return ret
+
+
+def aggcv(rlist, show_stdv=True):
+    # pylint: disable=invalid-name
+    """
+    Aggregate cross-validation results.
+    """
+    cvmap = {}
+    ret = rlist[0].split()[0]
+    for line in rlist:
+        arr = line.split()
+        assert ret == arr[0]
+        for it in arr[1:]:
+            if not isinstance(it, STRING_TYPES):
+                it = it.decode()
+            k, v = it.split(':')
+            if k not in cvmap:
+                cvmap[k] = []
+            cvmap[k].append(float(v))
+    for k, v in sorted(cvmap.items(), key=lambda x: x[0]):
+        v = np.array(v)
+        if not isinstance(ret, STRING_TYPES):
+            ret = ret.decode()
+        if show_stdv:
+            ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v))
+        else:
+            ret += '\tcv-%s:%f' % (k, np.mean(v))
+    return ret
+
+
+def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
+       obj=None, feval=None, fpreproc=None, show_stdv=True, seed=0):
+    # pylint: disable = invalid-name
+    """Cross-validation with given paramaters.
+
+    Parameters
+    ----------
+    params : dict
+        Booster params.
+    dtrain : DMatrix
+        Data to be trained.
+    num_boost_round : int
+        Number of boosting iterations.
+    nfold : int
+        Number of folds in CV.
+    metrics : list of strings
+        Evaluation metrics to be watched in CV.
+    obj : function
+        Custom objective function.
+    feval : function
+        Custom evaluation function.
+    fpreproc : function
+        Preprocessing function that takes (dtrain, dtest, param) and returns
+        transformed versions of those.
+    show_stdv : bool
+        Whether to display the standard deviation.
+    seed : int
+        Seed used to generate the folds (passed to numpy.random.seed).
+
+    Returns
+    -------
+    evaluation history : list(string)
+    """
+    results = []
+    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
+    for i in range(num_boost_round):
+        for fold in cvfolds:
+            fold.update(i, obj)
+        res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv)
+        sys.stderr.write(res + '\n')
+        results.append(res)
+    return results
+