commit
16781ac8f9
@ -12,6 +12,7 @@ env:
|
||||
- TASK=lint LINT_LANG=python
|
||||
- TASK=R-package CXX=g++
|
||||
- TASK=python-package CXX=g++
|
||||
- TASK=python-package3 CXX=g++
|
||||
- TASK=java-package CXX=g++
|
||||
- TASK=build CXX=g++
|
||||
- TASK=build-with-dmlc CXX=g++
|
||||
@ -29,9 +30,6 @@ addons:
|
||||
- wget
|
||||
- libcurl4-openssl-dev
|
||||
- unzip
|
||||
- python-numpy
|
||||
- python-scipy
|
||||
- python-nose
|
||||
|
||||
before_install:
|
||||
- scripts/travis_osx_install.sh
|
||||
|
||||
@ -40,6 +40,8 @@ on going at master
|
||||
* Fix List
|
||||
- Fixed a possible problem with poisson regression for R.
|
||||
* Python module now throws an exception instead of crashing the terminal when a parameter error happens.
|
||||
* Python module now has importance plot and tree plot functions (a short usage sketch follows this list).
|
||||
* Java API is ready for use
|
||||
* Added more test cases and continuous integration to make each build more robust
|
||||
* Improvements in sklearn compatible module
|
||||
|
||||
* Added pip installation functionality for python module
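As a hedged illustration of the new plotting functions mentioned above (the function names match the exports added to the python package in this commit; the toy data and parameters here are made up):

```python
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt

# tiny synthetic problem, only to obtain a trained Booster for plotting
X = np.random.rand(100, 5)
y = (X[:, 0] > 0.5).astype(int)
bst = xgb.train({'max_depth': 2, 'objective': 'binary:logistic'},
                xgb.DMatrix(X, label=y), num_boost_round=3)

xgb.plot_importance(bst)         # bar chart of feature importance
xgb.plot_tree(bst, num_trees=0)  # draw the first tree (needs graphviz)
plt.show()
```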
|
||||
|
||||
@ -34,6 +34,7 @@ List of Contributors
|
||||
* [Zygmunt Zając](https://github.com/zygmuntz)
|
||||
- Zygmunt is the master behind the early stopping feature frequently used by kagglers.
|
||||
* [Ajinkya Kale](https://github.com/ajkl)
|
||||
* [Yuan Tang](https://github.com/terrytangyuan)
|
||||
* [Boliang Chen](https://github.com/cblsjtu)
|
||||
* [Vadim Khotilovich](https://github.com/khotilov)
|
||||
* [Yangqing Men](https://github.com/yanqingmen)
|
||||
@ -44,3 +45,7 @@ List of Contributors
|
||||
* [Jamie Hall](https://github.com/nerdcha)
|
||||
- Jamie is the initial creator of the xgboost sklearn module.
|
||||
* [Yen-Ying Lee](https://github.com/white1033)
|
||||
* [Masaaki Horikoshi](https://github.com/sinhrks)
|
||||
- Masaaki is the initial creator of xgboost python plotting module.
|
||||
* [Hongliang Liu](https://github.com/phunterlau)
|
||||
- Hongliang is the maintainer of xgboost python PyPI package for pip installation.
|
||||
|
||||
Makefile (35 lines changed)
@ -1,4 +1,5 @@
|
||||
export CC = gcc
|
||||
#build on the fly
|
||||
export CXX = g++
|
||||
export MPICXX = mpicxx
|
||||
export LDFLAGS= -pthread -lm
|
||||
@ -20,9 +21,17 @@ endif
|
||||
ifeq ($(no_omp),1)
|
||||
CFLAGS += -DDISABLE_OPENMP
|
||||
else
|
||||
CFLAGS += -fopenmp
|
||||
#CFLAGS += -fopenmp
|
||||
ifeq ($(omp_mac_static),1)
|
||||
#CFLAGS += -fopenmp -Bstatic
|
||||
CFLAGS += -static-libgcc -static-libstdc++ -L. -fopenmp
|
||||
#LDFLAGS += -Wl,--whole-archive -lpthread -Wl --no-whole-archive
|
||||
else
|
||||
CFLAGS += -fopenmp
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
# by default use c++11
|
||||
ifeq ($(cxx11),1)
|
||||
CFLAGS += -std=c++11
|
||||
@ -167,6 +176,30 @@ Rcheck:
|
||||
make Rbuild
|
||||
R CMD check --as-cran xgboost*.tar.gz
|
||||
|
||||
pythonpack:
|
||||
#make clean
|
||||
cd subtree/rabit;make clean;cd ..
|
||||
rm -rf xgboost-deploy xgboost*.tar.gz
|
||||
cp -r python-package xgboost-deploy
|
||||
cp *.md xgboost-deploy/
|
||||
cp LICENSE xgboost-deploy/
|
||||
cp Makefile xgboost-deploy/xgboost
|
||||
cp -r wrapper xgboost-deploy/xgboost
|
||||
cp -r subtree xgboost-deploy/xgboost
|
||||
cp -r multi-node xgboost-deploy/xgboost
|
||||
cp -r windows xgboost-deploy/xgboost
|
||||
cp -r src xgboost-deploy/xgboost
|
||||
|
||||
#make python
|
||||
|
||||
pythonbuild:
|
||||
make pythonpack
|
||||
python setup.py install
|
||||
|
||||
pythoncheck:
|
||||
make pythonbuild
|
||||
python -c 'import xgboost;print xgboost.core.find_lib_path()'
|
||||
|
||||
# lint requires dmlc to be in current folder
|
||||
lint:
|
||||
dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package python-package
|
||||
|
||||
@ -23,7 +23,8 @@ Suggests:
|
||||
ggplot2 (>= 1.0.0),
|
||||
DiagrammeR (>= 0.6),
|
||||
Ckmeans.1d.dp (>= 3.3.1),
|
||||
vcd (>= 1.3)
|
||||
vcd (>= 1.3),
|
||||
testthat
|
||||
Depends:
|
||||
R (>= 2.10)
|
||||
Imports:
|
||||
|
||||
@ -103,17 +103,21 @@ xgb.Booster.check <- function(bst, saveraw = TRUE)
|
||||
## ----the following are low level iteratively function, not needed if
|
||||
## you do not want to use them ---------------------------------------
|
||||
# get dmatrix from data, label
|
||||
xgb.get.DMatrix <- function(data, label = NULL, missing = NULL) {
|
||||
xgb.get.DMatrix <- function(data, label = NULL, missing = NULL, weight = NULL) {
|
||||
inClass <- class(data)
|
||||
if (inClass == "dgCMatrix" || inClass == "matrix") {
|
||||
if (is.null(label)) {
|
||||
stop("xgboost: need label when data is a matrix")
|
||||
}
|
||||
dtrain <- xgb.DMatrix(data, label = label)
|
||||
if (is.null(missing)){
|
||||
dtrain <- xgb.DMatrix(data, label = label)
|
||||
} else {
|
||||
dtrain <- xgb.DMatrix(data, label = label, missing = missing)
|
||||
}
|
||||
if (!is.null(weight)){
|
||||
xgb.setinfo(dtrain, "weight", weight)
|
||||
}
|
||||
} else {
|
||||
if (!is.null(label)) {
|
||||
warning("xgboost: label will be ignored.")
|
||||
@ -122,6 +126,9 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NULL) {
|
||||
dtrain <- xgb.DMatrix(data)
|
||||
} else if (inClass == "xgb.DMatrix") {
|
||||
dtrain <- data
|
||||
} else if (inClass == "data.frame") {
|
||||
stop("xgboost only support numerical matrix input,
|
||||
use 'data.frame' to transform the data.")
|
||||
} else {
|
||||
stop("xgboost: Invalid input of data")
|
||||
}
|
||||
|
||||
@ -72,6 +72,8 @@
|
||||
#' keeps getting worse consecutively for \code{k} rounds.
|
||||
#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
|
||||
#' \code{maximize=TRUE} means the larger the evaluation score the better.
|
||||
#' @param save_period save the model to the disk in every \code{save_period} rounds, 0 means no such action.
|
||||
#' @param save_name the name or path for periodically saved model file.
|
||||
#' @param ... other parameters to pass to \code{params}.
|
||||
#'
|
||||
#' @details
|
||||
@ -120,7 +122,8 @@
|
||||
#'
|
||||
xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
|
||||
obj = NULL, feval = NULL, verbose = 1, print.every.n=1L,
|
||||
early.stop.round = NULL, maximize = NULL, ...) {
|
||||
early.stop.round = NULL, maximize = NULL,
|
||||
save_period = 0, save_name = "xgboost.model", ...) {
|
||||
dtrain <- data
|
||||
if (typeof(params) != "list") {
|
||||
stop("xgb.train: first argument params must be list")
|
||||
@ -215,6 +218,11 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
|
||||
}
|
||||
}
|
||||
}
|
||||
if (save_period > 0) {
|
||||
if (i %% save_period == 0) {
|
||||
xgb.save(bst, save_name)
|
||||
}
|
||||
}
|
||||
}
|
||||
bst <- xgb.Booster.check(bst)
|
||||
if (!is.null(early.stop.round)) {
|
||||
|
||||
@ -31,11 +31,14 @@
|
||||
#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
|
||||
#' @param missing Missing is only used when input is dense matrix, pick a float
|
||||
#' value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values.
|
||||
#' @param weight a vector indicating the weight for each row of the input.
|
||||
#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
|
||||
#' If set to an integer \code{k}, training with a validation set will stop if the performance
|
||||
#' keeps getting worse consecutively for \code{k} rounds.
|
||||
#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
|
||||
#' \code{maximize=TRUE} means the larger the evaluation score the better.
|
||||
#' @param save_period save the model to the disk in every \code{save_period} rounds, 0 means no such action.
|
||||
#' @param save_name the name or path for periodically saved model file.
|
||||
#' @param ... other parameters to pass to \code{params}.
|
||||
#'
|
||||
#' @details
|
||||
@ -56,14 +59,11 @@
|
||||
#'
|
||||
#' @export
|
||||
#'
|
||||
xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds,
|
||||
xgboost <- function(data = NULL, label = NULL, missing = NULL, weight = NULL,
|
||||
params = list(), nrounds,
|
||||
verbose = 1, print.every.n = 1L, early.stop.round = NULL,
|
||||
maximize = NULL, ...) {
|
||||
if (is.null(missing)) {
|
||||
dtrain <- xgb.get.DMatrix(data, label)
|
||||
} else {
|
||||
dtrain <- xgb.get.DMatrix(data, label, missing)
|
||||
}
|
||||
maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) {
|
||||
dtrain <- xgb.get.DMatrix(data, label, missing, weight)
|
||||
|
||||
params <- append(params, list(...))
|
||||
|
||||
@ -74,7 +74,8 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(),
|
||||
}
|
||||
|
||||
bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n=print.every.n,
|
||||
early.stop.round = early.stop.round)
|
||||
early.stop.round = early.stop.round, maximize = maximize,
|
||||
save_period = save_period, save_name = save_name)
|
||||
|
||||
return(bst)
|
||||
}
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
basic_walkthrough Basic feature walkthrough
|
||||
caret_wrapper Use xgboost to train in caret library
|
||||
custom_objective Customize loss function and evaluation metric
|
||||
boost_from_prediction Boosting from existing prediction
|
||||
predict_first_ntree Predicting using first n trees
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
XGBoost R Feature Walkthrough
|
||||
====
|
||||
* [Basic walkthrough of wrappers](basic_walkthrough.R)
|
||||
|
||||
* [Train a xgboost model from caret library](caret_wrapper.R)
|
||||
* [Customize loss function and evaluation metric](custom_objective.R)
|
||||
* [Boosting from existing prediction](boost_from_prediction.R)
|
||||
* [Predicting using first n trees](predict_first_ntree.R)
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
require(xgboost)
|
||||
require(methods)
|
||||
# we load in the agaricus dataset
|
||||
# In this example, we are aiming to predict whether a mushroom can be eated
|
||||
# In this example, we are aiming to predict whether a mushroom can be eaten
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
train <- agaricus.train
|
||||
@ -12,8 +12,8 @@ class(train$data)
|
||||
|
||||
#-------------Basic Training using XGBoost-----------------
|
||||
# this is the basic usage of xgboost you can put matrix in data field
|
||||
# note: we are puting in sparse matrix here, xgboost naturally handles sparse input
|
||||
# use sparse matrix when your feature is sparse(e.g. when you using one-hot encoding vector)
|
||||
# note: we are putting in sparse matrix here, xgboost naturally handles sparse input
|
||||
# use sparse matrix when your feature is sparse(e.g. when you are using one-hot encoding vector)
|
||||
print("training xgboost with sparseMatrix")
|
||||
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,
|
||||
nthread = 2, objective = "binary:logistic")
|
||||
@ -22,7 +22,7 @@ print("training xgboost with Matrix")
|
||||
bst <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nround = 2,
|
||||
nthread = 2, objective = "binary:logistic")
|
||||
|
||||
# you can also put in xgb.DMatrix object, stores label, data and other meta datas needed for advanced features
|
||||
# you can also put in xgb.DMatrix object, which stores label, data and other meta datas needed for advanced features
|
||||
print("training xgboost with xgb.DMatrix")
|
||||
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
|
||||
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, nthread = 2,
|
||||
@ -72,7 +72,7 @@ print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred))))
|
||||
dtrain <- xgb.DMatrix(data = train$data, label=train$label)
|
||||
dtest <- xgb.DMatrix(data = test$data, label=test$label)
|
||||
#---------------Using watchlist----------------
|
||||
# watchlist is a list of xgb.DMatrix, each of them tagged with name
|
||||
# watchlist is a list of xgb.DMatrix, each of them is tagged with name
|
||||
watchlist <- list(train=dtrain, test=dtest)
|
||||
# to train with watchlist, use xgb.train, which contains more advanced features
|
||||
# watchlist allows us to monitor the evaluation result on all data in the list
|
||||
|
||||
R-package/demo/caret_wrapper.R (new file, 35 lines)
@ -0,0 +1,35 @@
|
||||
# install development version of caret library that contains xgboost models
|
||||
devtools::install_github("topepo/caret/pkg/caret")
|
||||
require(caret)
|
||||
require(xgboost)
|
||||
require(data.table)
|
||||
require(vcd)
|
||||
require(e1071)
|
||||
|
||||
# Load Arthritis dataset in memory.
|
||||
data(Arthritis)
|
||||
# Create a copy of the dataset with the data.table package (data.table is 100% compliant with R data frames, but its syntax is a lot more consistent and its performance is really good).
|
||||
df <- data.table(Arthritis, keep.rownames = F)
|
||||
|
||||
# Let's add some new categorical features to see if it helps. Of course these features are highly correlated with the Age feature. Usually that's not a good thing in ML, but tree algorithms (including boosted trees) are able to select the best features, even in the case of highly correlated features.
|
||||
# For the first feature we create groups of age by rounding the real age. Note that we transform it to a factor (categorical data) so the algorithm treats them as independent values.
|
||||
df[,AgeDiscret:= as.factor(round(Age/10,0))]
|
||||
|
||||
# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I chose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
|
||||
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
|
||||
|
||||
# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
|
||||
df[,ID:=NULL]
|
||||
|
||||
#-------------Basic Training using XGBoost in caret Library-----------------
|
||||
# Set up control parameters for caret::train
|
||||
# Here we use 10-fold cross-validation, repeating twice, and using random search for tuning hyper-parameters.
|
||||
fitControl <- trainControl(method = "cv", number = 10, repeats = 2, search = "random")
|
||||
# train a xgbTree model using caret::train
|
||||
model <- train(factor(Improved)~., data = df, method = "xgbTree", trControl = fitControl)
|
||||
|
||||
# Instead of trees for our boosters, you can also fit a linear or logistic regression model using xgbLinear
|
||||
# model <- train(factor(Improved)~., data = df, method = "xgbLinear", trControl = fitControl)
|
||||
|
||||
# See model results
|
||||
print(model)
|
||||
@ -7,7 +7,7 @@ if (!require(vcd)) {
|
||||
}
|
||||
# According to its documentation, Xgboost works only on numbers.
|
||||
# Sometimes the dataset we have to work on have categorical data.
|
||||
# A categorical variable is one which have a fixed number of values. By exemple, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as value, it is a categorical variable.
|
||||
# A categorical variable is one which has a fixed number of values. For example, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as its value, it is a categorical variable.
|
||||
#
|
||||
# In R, categorical variable is called Factor.
|
||||
# Type ?factor in console for more information.
|
||||
@ -74,11 +74,11 @@ importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
|
||||
print(importance)
|
||||
# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).
|
||||
|
||||
# Does these results make sense?
|
||||
# Do these results make sense?
|
||||
# Let's check some Chi2 between each of these features and the outcome.
|
||||
|
||||
print(chisq.test(df$Age, df$Y))
|
||||
# Pearson correlation between Age and illness disapearing is 35
|
||||
# Pearson correlation between Age and illness disappearing is 35
|
||||
|
||||
print(chisq.test(df$AgeDiscret, df$Y))
|
||||
# Our first simplification of Age gives a Pearson correlation of 8.
|
||||
@ -86,6 +86,6 @@ print(chisq.test(df$AgeDiscret, df$Y))
|
||||
print(chisq.test(df$AgeCat, df$Y))
|
||||
# The perfectly random split I did between young and old at 30 years old has a low correlation of 2. It's a result we might expect: maybe in my mind > 30 years is being old (I am 32 and starting to feel old, this may explain that), but for the illness we are studying, the age of vulnerability is not the same. Don't let your "gut" lower the quality of your model. In "data science", there is science :-)
|
||||
|
||||
# As you can see, in general destroying information by simplying it won't improve your model. Chi2 just demonstrates that. But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not enough complex to show that. Check Kaggle forum for some challenging datasets.
|
||||
# As you can see, in general destroying information by simplifying it won't improve your model. Chi2 just demonstrates that. But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not enough complex to show that. Check Kaggle forum for some challenging datasets.
|
||||
# However it's almost always worse when you add some arbitrary rules.
|
||||
# Moreover, you can notice that even though we have added some useless new features highly correlated with other features, the boosted tree algorithm has been able to choose the best one, which in this case is the Age. A linear model may not be that strong in this scenario.
|
||||
|
||||
@ -9,3 +9,4 @@ demo(create_sparse_matrix)
|
||||
demo(predict_leaf_indices)
|
||||
demo(early_stopping)
|
||||
demo(poisson_regression)
|
||||
demo(caret_wrapper)
|
||||
|
||||
@ -6,7 +6,8 @@
|
||||
\usage{
|
||||
xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
|
||||
feval = NULL, verbose = 1, print.every.n = 1L,
|
||||
early.stop.round = NULL, maximize = NULL, ...)
|
||||
early.stop.round = NULL, maximize = NULL, save_period = 0,
|
||||
save_name = "xgboost.model", ...)
|
||||
}
|
||||
\arguments{
|
||||
\item{params}{the list of parameters.
|
||||
@ -87,6 +88,10 @@ keeps getting worse consecutively for \code{k} rounds.}
|
||||
\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
|
||||
\code{maximize=TRUE} means the larger the evaluation score the better.}
|
||||
|
||||
\item{save_period}{save the model to the disk in every \code{save_period} rounds, 0 means no such action.}
|
||||
|
||||
\item{save_name}{the name or path for periodically saved model file.}
|
||||
|
||||
\item{...}{other parameters to pass to \code{params}.}
|
||||
}
|
||||
\description{
|
||||
|
||||
@ -4,9 +4,10 @@
|
||||
\alias{xgboost}
|
||||
\title{eXtreme Gradient Boosting (Tree) library}
|
||||
\usage{
|
||||
xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
|
||||
nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL,
|
||||
maximize = NULL, ...)
|
||||
xgboost(data = NULL, label = NULL, missing = NULL, weight = NULL,
|
||||
params = list(), nrounds, verbose = 1, print.every.n = 1L,
|
||||
early.stop.round = NULL, maximize = NULL, save_period = 0,
|
||||
save_name = "xgboost.model", ...)
|
||||
}
|
||||
\arguments{
|
||||
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
|
||||
@ -18,6 +19,8 @@ if data is local data file or \code{xgb.DMatrix}.}
|
||||
\item{missing}{Missing is only used when input is dense matrix, pick a float
|
||||
value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values.}
|
||||
|
||||
\item{weight}{a vector indicating the weight for each row of the input.}
|
||||
|
||||
\item{params}{the list of parameters.
|
||||
|
||||
Commonly used ones are:
|
||||
@ -51,6 +54,10 @@ keeps getting worse consecutively for \code{k} rounds.}
|
||||
\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
|
||||
\code{maximize=TRUE} means the larger the evaluation score the better.}
|
||||
|
||||
\item{save_period}{save the model to the disk in every \code{save_period} rounds, 0 means no such action.}
|
||||
|
||||
\item{save_name}{the name or path for periodically saved model file.}
|
||||
|
||||
\item{...}{other parameters to pass to \code{params}.}
|
||||
}
|
||||
\description{
|
||||
|
||||
R-package/tests/testthat.R (new file, 4 lines)
@ -0,0 +1,4 @@
|
||||
library(testthat)
|
||||
library(xgboost)
|
||||
|
||||
test_check("xgboost")
|
||||
R-package/tests/testthat/test_basic.R (new file, 33 lines)
@ -0,0 +1,33 @@
|
||||
require(xgboost)
|
||||
|
||||
context("basic functions")
|
||||
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
train = agaricus.train
|
||||
test = agaricus.test
|
||||
|
||||
test_that("train and predict", {
|
||||
bst = xgboost(data = train$data, label = train$label, max.depth = 2,
|
||||
eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
|
||||
pred = predict(bst, test$data)
|
||||
})
|
||||
|
||||
|
||||
test_that("early stopping", {
|
||||
res = xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5,
|
||||
eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
|
||||
early.stop.round = 3, maximize = FALSE)
|
||||
expect_true(nrow(res)<20)
|
||||
bst = xgboost(data = train$data, label = train$label, max.depth = 2,
|
||||
eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
|
||||
early.stop.round = 3, maximize = FALSE)
|
||||
pred = predict(bst, test$data)
|
||||
})
|
||||
|
||||
test_that("save_period", {
|
||||
bst = xgboost(data = train$data, label = train$label, max.depth = 2,
|
||||
eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
|
||||
save_period = 10, save_name = "xgb.model")
|
||||
pred = predict(bst, test$data)
|
||||
})
|
||||
R-package/tests/testthat/test_custom_objective.R (new file, 47 lines)
@ -0,0 +1,47 @@
|
||||
context('Test models with custom objective')
|
||||
|
||||
require(xgboost)
|
||||
|
||||
test_that("custom objective works", {
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
watchlist <- list(eval = dtest, train = dtrain)
|
||||
num_round <- 2
|
||||
|
||||
logregobj <- function(preds, dtrain) {
|
||||
labels <- getinfo(dtrain, "label")
|
||||
preds <- 1/(1 + exp(-preds))
|
||||
grad <- preds - labels
|
||||
hess <- preds * (1 - preds)
|
||||
return(list(grad = grad, hess = hess))
|
||||
}
|
||||
evalerror <- function(preds, dtrain) {
|
||||
labels <- getinfo(dtrain, "label")
|
||||
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
|
||||
return(list(metric = "error", value = err))
|
||||
}
|
||||
|
||||
param <- list(max.depth=2, eta=1, nthread = 2, silent=1,
|
||||
objective=logregobj, eval_metric=evalerror)
|
||||
|
||||
bst <- xgb.train(param, dtrain, num_round, watchlist)
|
||||
expect_equal(class(bst), "xgb.Booster")
|
||||
expect_equal(length(bst$raw), 1064)
|
||||
attr(dtrain, 'label') <- getinfo(dtrain, 'label')
|
||||
|
||||
logregobjattr <- function(preds, dtrain) {
|
||||
labels <- attr(dtrain, 'label')
|
||||
preds <- 1/(1 + exp(-preds))
|
||||
grad <- preds - labels
|
||||
hess <- preds * (1 - preds)
|
||||
return(list(grad = grad, hess = hess))
|
||||
}
|
||||
param <- list(max.depth=2, eta=1, nthread = 2, silent=1,
|
||||
objective=logregobjattr, eval_metric=evalerror)
|
||||
bst <- xgb.train(param, dtrain, num_round, watchlist)
|
||||
expect_equal(class(bst), "xgb.Booster")
|
||||
expect_equal(length(bst$raw), 1064)
|
||||
})
|
||||
R-package/tests/testthat/test_glm.R (new file, 19 lines)
@ -0,0 +1,19 @@
|
||||
context('Test generalized linear models')
|
||||
|
||||
require(xgboost)
|
||||
|
||||
test_that("glm works", {
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
|
||||
expect_equal(class(dtrain), "xgb.DMatrix")
|
||||
expect_equal(class(dtest), "xgb.DMatrix")
|
||||
param <- list(objective = "binary:logistic", booster = "gblinear",
|
||||
nthread = 2, alpha = 0.0001, lambda = 1)
|
||||
watchlist <- list(eval = dtest, train = dtrain)
|
||||
num_round <- 2
|
||||
bst <- xgb.train(param, dtrain, num_round, watchlist)
|
||||
ypred <- predict(bst, dtest)
|
||||
expect_equal(length(getinfo(dtest, 'label')), 1611)
|
||||
})
|
||||
R-package/tests/testthat/test_helpers.R (new file, 32 lines)
@ -0,0 +1,32 @@
|
||||
context('Test helper functions')
|
||||
|
||||
require(xgboost)
|
||||
require(data.table)
|
||||
require(Matrix)
|
||||
require(vcd)
|
||||
|
||||
data(Arthritis)
|
||||
data(agaricus.train, package='xgboost')
|
||||
df <- data.table(Arthritis, keep.rownames = F)
|
||||
df[,AgeDiscret:= as.factor(round(Age/10,0))]
|
||||
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
|
||||
df[,ID:=NULL]
|
||||
sparse_matrix = sparse.model.matrix(Improved~.-1, data = df)
|
||||
output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
|
||||
bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
|
||||
eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
|
||||
|
||||
|
||||
test_that("xgb.dump works", {
|
||||
capture.output(print(xgb.dump(bst)))
|
||||
})
|
||||
|
||||
test_that("xgb.importance works", {
|
||||
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
|
||||
importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
|
||||
expect_equal(dim(importance), c(7, 4))
|
||||
})
|
||||
|
||||
test_that("xgb.plot.tree works", {
|
||||
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst)
|
||||
})
|
||||
R-package/tests/testthat/test_poisson_regression.R (new file, 13 lines)
@ -0,0 +1,13 @@
|
||||
context('Test poisson regression model')
|
||||
|
||||
require(xgboost)
|
||||
|
||||
test_that("poisson regression works", {
|
||||
data(mtcars)
|
||||
bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11],
|
||||
objective='count:poisson',nrounds=5)
|
||||
expect_equal(class(bst), "xgb.Booster")
|
||||
pred = predict(bst,as.matrix(mtcars[,-11]))
|
||||
expect_equal(length(pred), 32)
|
||||
sqrt(mean((pred-mtcars[,11])^2))
|
||||
})
|
||||
@ -19,7 +19,7 @@ Contents
|
||||
* [Build Instruction](doc/build.md)
|
||||
* [Features](#features)
|
||||
* [Distributed XGBoost](multi-node)
|
||||
* [Usecases](doc/README.md#highlight-links)
|
||||
* [Usecases](doc/index.md#highlight-links)
|
||||
* [Bug Reporting](#bug-reporting)
|
||||
* [Contributing to XGBoost](#contributing-to-xgboost)
|
||||
* [Committers and Contributors](CONTRIBUTORS.md)
|
||||
@ -29,8 +29,9 @@ Contents
|
||||
What's New
|
||||
----------
|
||||
|
||||
* XGBoost helps Owen Zhang to win the [Avito Context Ad Click competition](https://www.kaggle.com/c/avito-context-ad-clicks). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/08/26/avito-winners-interview-1st-place-owen-zhang/).
|
||||
* XGBoost helps Chenglong Chen to win [Kaggle CrowdFlower Competition](https://www.kaggle.com/c/crowdflower-search-relevance)
|
||||
Check out the [winning solution](doc/README.md#highlight-links)
|
||||
Check out the [winning solution](https://github.com/ChenglongChen/Kaggle_CrowdFlower)
|
||||
* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)
|
||||
* XGBoost helps three champion teams to win [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing)
|
||||
Check out the [winning solution](doc/README.md#highlight-links)
|
||||
|
||||
build.sh (12 lines changed)
@ -6,6 +6,18 @@
|
||||
|
||||
# See additional instruction in doc/build.md
|
||||
|
||||
#for building static OpenMP lib in MAC for easier installation in MAC
|
||||
#doesn't work with XCode clang/LLVM since Apple doesn't support,
|
||||
#needs brew install gcc 4.9+ with OpenMP. By default the static link is OFF
|
||||
static_omp=0
|
||||
if ((${static_omp}==1)); then
|
||||
rm libgomp.a
|
||||
ln -s `g++ -print-file-name=libgomp.a`
|
||||
make clean
|
||||
make omp_mac_static=1
|
||||
echo "Successfully build multi-thread static link xgboost"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if make; then
|
||||
echo "Successfully build multi-thread xgboost"
|
||||
|
||||
@ -7,5 +7,6 @@ XGBoost Python Feature Walkthrough
|
||||
* [Generalized Linear Model](generalized_linear_model.py)
|
||||
* [Cross validation](cross_validation.py)
|
||||
* [Predicting leaf indices](predict_leaf_indices.py)
|
||||
* [Sklearn Wrapper](sklearn_example.py)
|
||||
* [Sklearn Wrapper](sklearn_examples.py)
|
||||
* [Sklearn Parallel](sklearn_parallel.py)
|
||||
* [External Memory](external_memory.py)
|
||||
|
||||
@ -2,7 +2,11 @@
|
||||
python basic_walkthrough.py
|
||||
python custom_objective.py
|
||||
python boost_from_prediction.py
|
||||
python predict_first_ntree.py
|
||||
python generalized_linear_model.py
|
||||
python cross_validation.py
|
||||
python predict_leaf_indices.py
|
||||
python sklearn_examples.py
|
||||
python sklearn_parallel.py
|
||||
python external_memory.py
|
||||
rm -rf *~ *.model *.buffer
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
The documentation of xgboost is generated with recommonmark and sphinx.
|
||||
|
||||
You can build it locally by typing "make html" in this folder.
|
||||
- You will need to rerun the recommonmark script for readthedocs in sphinx_util.
|
||||
- This was a hack to get the customized parser into readthedocs, hopefully to be removed in future.
|
||||
- clone https://github.com/tqchen/recommonmark to root
|
||||
- type make html
|
||||
|
||||
Check out https://recommonmark.readthedocs.org for a guide on how to write markdown with the extensions used in this doc, such as math formulas and tables of content.
|
||||
@ -5,7 +5,7 @@ Everyone is more than welcomed to is a great way to make the project better.
|
||||
The project is maintained by a committee of [committers](../../CONTRIBUTORS.md#comitters) who will review and merge pull requests from contributors.
|
||||
|
||||
Contributing Code
|
||||
=================
|
||||
-----------------
|
||||
* The C++ code follows Google C++ style
|
||||
* We follow numpy style to document our python module
|
||||
* Tools to precheck codestyle
|
||||
|
||||
@ -6,6 +6,10 @@ How to tune parameters
|
||||
----------------------
|
||||
See [Parameter Tuning Guide](param_tuning.md)
|
||||
|
||||
Description on the model
|
||||
------------------------
|
||||
See [Introduction to Boosted Trees](model.md)
|
||||
|
||||
|
||||
I have a big dataset
|
||||
--------------------
|
||||
|
||||
New binary image files added under doc/img/ (referenced by doc/model.md):
doc/img/cart.png (175 KiB)
doc/img/split_find.png (45 KiB)
doc/img/step_fit.png (62 KiB)
doc/img/struct_score.png (73 KiB)
doc/img/twocart.png (100 KiB)
@ -19,7 +19,7 @@ The best way to get started to learn xgboost is by the examples. There are three
|
||||
|
||||
After you get familiar with the interface, check out the following additional resources
|
||||
* [Frequently Asked Questions](faq.md)
|
||||
* [Learning what is in Behind: Introduction to Boosted Trees](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf)
|
||||
* [Learning what is in Behind: Introduction to Boosted Trees](model.md)
|
||||
* [User Guide](#user-guide) contains comprehensive list of documents of xgboost.
|
||||
* [Developer Guide](dev-guide/contribute.md)
|
||||
|
||||
@ -38,6 +38,7 @@ are great resources to learn xgboost by real examples. If you think you have som
|
||||
* [Understanding XGBoost Model on Otto Dataset](../demo/kaggle-otto/understandingXGBoostModel.Rmd) (R package)
|
||||
- This tutorial teaches you how to use xgboost to compete in the Kaggle Otto challenge.
|
||||
|
||||
|
||||
Highlight Solutions
|
||||
-------------------
|
||||
This section is about blog posts, presentations and videos discussing how to use xgboost to solve your interesting problem. If you think something belongs here, send a pull request.
|
||||
@ -51,7 +52,7 @@ This section is about blogposts, presentation and videos discussing how to use x
|
||||
User Guide
|
||||
----------
|
||||
* [Frequently Asked Questions](faq.md)
|
||||
* [Introduction to Boosted Trees](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf)
|
||||
* [Introduction to Boosted Trees](model.md)
|
||||
* [Using XGBoost in Python](python/python_intro.md)
|
||||
* [Using XGBoost in R](../R-package/vignettes/xgboostPresentation.Rmd)
|
||||
* [Learning to use XGBoost by Example](../demo)
|
||||
@ -61,7 +62,6 @@ User Guide
|
||||
* [Parameters](parameter.md)
|
||||
* [Notes on Parameter Tuning](param_tuning.md)
|
||||
|
||||
|
||||
Developer Guide
|
||||
---------------
|
||||
* [Developer Guide](dev-guide/contribute.md)
|
||||
@ -69,4 +69,3 @@ Developer Guide
|
||||
API Reference
|
||||
-------------
|
||||
* [Python API Reference](python/python_api.rst)
|
||||
|
||||
|
||||
doc/model.md (new file, 232 lines)
@ -0,0 +1,232 @@
|
||||
Introduction to Boosted Trees
|
||||
=============================
|
||||
XGBoost is short for "Extreme Gradient Boosting", where the term "Gradient Boosting" was proposed in the paper _Greedy Function Approximation: A Gradient Boosting Machine_ by Friedman; xgboost is based on this original model. This is a tutorial on boosted trees; most of the content is based on these [slides](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf) by the author of xgboost.
|
||||
|
||||
GBM (boosted trees) has been around for quite a while, and there are a lot of materials on the topic. This tutorial tries to explain boosted trees in a self-contained and principled way using the elements of supervised learning. We think this explanation is cleaner, more formal, and motivates the variant used in xgboost.
|
||||
|
||||
Elements of Supervised Learning
|
||||
-------------------------------
|
||||
XGBoost is used for supervised learning problems, where we use the training data ``$ x_i $`` to predict a target variable ``$ y_i $``.
|
||||
Before we dive into trees, let us start by reviewing the basic elements of supervised learning.
|
||||
|
||||
### Model and Parameters
|
||||
The ***model*** in supervised learning usually refers to the mathematical structure by which the prediction ``$ \hat{y}_i $`` is made from the input ``$ x_i $``.
|
||||
For example, a common model is *linear model*, where the prediction is given by ``$ \hat{y}_i = \sum_j w_j x_{ij} $``, a linear combination of weighted input features.
|
||||
The prediction value can have different interpretations, depending on the task.
|
||||
For example, it can be logistic transformed to get the probability of the positive class in logistic regression, and it can also be used as a ranking score when we want to rank the outputs.
|
||||
|
||||
The ***parameters*** are the undetermined part that we need to learn from data. In the linear regression problem, the parameters are the coefficients ``$ w $``.
|
||||
Usually we will use ``$ \Theta $`` to denote the parameters.
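To make the model/parameter distinction concrete, here is a minimal numpy sketch (not from the slides; the data and weights are made up) of the linear model's prediction:

```python
import numpy as np

X = np.array([[1.0, 2.0],
              [3.0, 4.0]])   # two examples x_i with two features each
w = np.array([0.5, -0.2])    # the parameters Theta = w that we would learn

y_hat = X @ w                # y_hat_i = sum_j w_j * x_ij
print(y_hat)                 # [0.1  0.7]
```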
|
||||
|
||||
### Objective Function: Training Loss + Regularization
|
||||
|
||||
Based on different understandings or assumptions about ``$ y_i $``, we can have different problems such as regression, classification, ordering, etc.
|
||||
We need a way to find the best parameters given the training data. In order to do so, we need to define a so-called ***objective function***,
|
||||
to measure the performance of the model under a certain set of parameters.
|
||||
|
||||
A very important fact about objective functions is that they ***must always*** contain two parts: training loss and regularization.
|
||||
|
||||
```math
|
||||
Obj(\Theta) = L(\Theta) + \Omega(\Theta)
|
||||
```
|
||||
|
||||
where ``$ L $`` is the training loss function, and ``$ \Omega $`` is the regularization term. The training loss measures how *predictive* our model is on training data.
|
||||
For example, a commonly used training loss is mean squared error.
|
||||
|
||||
```math
|
||||
L(\Theta) = \sum_i (y_i-\hat{y}_i)^2
|
||||
```
|
||||
Another commonly used loss function is logistic loss for logistic regression
|
||||
|
||||
```math
|
||||
L(\theta) = \sum_i[ y_i\ln (1+e^{-\hat{y}_i}) + (1-y_i)\ln (1+e^{\hat{y}_i})]
|
||||
```
|
||||
|
||||
The ***regularization term*** is what people usually forget to add. The regularization term controls the complexity of the model, which helps us to avoid overfitting.
|
||||
This sounds a bit abstract, so let us consider the problem in the following picture. You are asked to visually *fit* a step function given the input data points
|
||||
in the upper left corner of the image. Which solution among the three do you think is the best fit?
|
||||
|
||||

|
||||
|
||||
The answer is already marked in red. Please consider whether it looks reasonable to you visually. The general principle is that we want a ***simple*** and ***predictive*** model.
|
||||
The tradeoff between the two is also referred to as the bias-variance tradeoff in machine learning.
|
||||
|
||||
|
||||
### Why introduce the general principle
|
||||
The elements introduced above form the basics of supervised learning, and they are naturally the building blocks of machine learning toolkits.
|
||||
For example, you should be able to answer what is the difference and common parts between boosted trees and random forest.
|
||||
Understanding the process in a formalized way also helps us to understand the objective that we are learning and the reason behind heuristics such as
|
||||
pruning and smoothing.
|
||||
|
||||
Tree Ensemble
|
||||
-------------
|
||||
Now that we have introduced the elements of supervised learning, let us get started with real trees.
|
||||
To begin with, let us first learn about the ***model*** of xgboost: tree ensembles.
|
||||
The tree ensemble model is a set of classification and regression trees (CART). Here's a simple example of a CART
|
||||
that classifies whether someone will like computer games.
|
||||
|
||||

|
||||
|
||||
We classify the members of this family into different leaves, and assign them the score on the corresponding leaf.
|
||||
A CART is a bit different from decision trees, whose leaves only contain decision values. In CART, a real score
|
||||
is associated with each of the leaves, which gives us richer interpretations that go beyond classification.
|
||||
This also makes the unified optimization step easier, as we will see in a later part of this tutorial.
|
||||
|
||||
Usually, a single tree is not strong enough to be used in practice. What is actually used is the so-called
|
||||
tree ensemble model, which sums the predictions of multiple trees together.
|
||||
|
||||

|
||||
|
||||
Here is an example of tree ensemble of two trees. The prediction scores of each individual tree are summed up to get the final score.
|
||||
If you look at the example, an important fact is that the two trees try to *complement* each other.
|
||||
Mathematically, we can write our model into the form
|
||||
|
||||
```math
|
||||
\hat{y}_i = \sum_{k=1}^K f_k(x_i), f_k \in \mathcal{F}
|
||||
```
|
||||
|
||||
where ``$ K $`` is the number of trees, ``$ f $`` is a function in the functional space ``$ \mathcal{F} $``, and ``$ \mathcal{F} $`` is the set of all possible CARTs. Therefore our objective to optimize can be written as
|
||||
|
||||
```math
|
||||
obj(\Theta) = \sum_i^n l(y_i, \hat{y}_i) + \sum_{k=1}^K \Omega(f_k)
|
||||
```
|
||||
Now here comes the question: what is the *model* of random forest? It is exactly tree ensembles! So random forests and boosted trees are not different in terms of the model;
|
||||
the difference is how we train them. This means if you write a predictive service of tree ensembles, you only need to write one of them and they should directly work
|
||||
for both random forests and boosted trees. One example of why the elements of supervised learning rock.
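As a rough sketch of the ensemble formula above: each tree ``$ f_k $`` is just a function mapping features to a score, and the prediction is the sum of those scores. The two toy trees below are hand-written stand-ins, loosely inspired by the computer-games example; the split rules and leaf scores are made up.

```python
def tree_1(x):
    # first tree: split on age
    return 2.0 if x["age"] < 15 else -1.0

def tree_2(x):
    # second tree: split on daily computer use
    return 0.9 if x["uses_computer_daily"] else -0.9

ensemble = [tree_1, tree_2]   # the set of trees f_1, ..., f_K

x = {"age": 10, "uses_computer_daily": True}
y_hat = sum(f(x) for f in ensemble)   # y_hat = f_1(x) + f_2(x) = 2.9
print(y_hat)
```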
|
||||
|
||||
Tree Boosting
|
||||
-------------
|
||||
After introducing the model, let us begin with the real training part. How should we learn the trees?
|
||||
The answer is, as always for supervised learning models: *define an objective function, and optimize it*!
|
||||
|
||||
Assume we have the following objective function (remember it always needs to contain training loss and regularization):
|
||||
```math
|
||||
Obj = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t)}) + \sum_{i=1}^t\Omega(f_i) \\
|
||||
```
|
||||
|
||||
### Additive Training
|
||||
|
||||
The first thing we want to ask is: what are the ***parameters*** of trees? You can find that what we need to learn are those functions ``$f_i$``, each containing the structure
|
||||
of the tree and the leaf scores. This is much harder than a traditional optimization problem where you can simply take the gradient and go.
|
||||
It is not easy to train all the trees at once.
|
||||
Instead, we use an additive strategy: fix what we have learned, add a new tree at a time.
|
||||
We note the prediction value at step ``$t$`` by ``$ \hat{y}_i^{(t)}$``, so we have
|
||||
|
||||
```math
|
||||
\hat{y}_i^{(0)} &= 0\\
|
||||
\hat{y}_i^{(1)} &= f_1(x_i) = \hat{y}_i^{(0)} + f_1(x_i)\\
|
||||
\hat{y}_i^{(2)} &= f_1(x_i) + f_2(x_i)= \hat{y}_i^{(1)} + f_2(x_i)\\
|
||||
&\dots\\
|
||||
\hat{y}_i^{(t)} &= \sum_{k=1}^t f_k(x_i)= \hat{y}_i^{(t-1)} + f_t(x_i)
|
||||
```
|
||||
|
||||
It remains to ask: which tree do we want at each step? A natural choice is to add the one that optimizes our objective.
|
||||
|
||||
```math
|
||||
Obj^{(t)} & = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t)}) + \sum_{i=1}^t\Omega(f_i) \\
|
||||
& = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t-1)} + f_t(x_i)) + \Omega(f_t) + constant
|
||||
```
|
||||
|
||||
If we consider using MSE as our loss function, the objective takes the following form.
|
||||
|
||||
```math
|
||||
Obj^{(t)} & = \sum_{i=1}^n (y_i - (\hat{y}_i^{(t-1)} + f_t(x_i)))^2 + \sum_{i=1}^t\Omega(f_i) \\
|
||||
& = \sum_{i=1}^n [2(\hat{y}_i^{(t-1)} - y_i)f_t(x_i) + f_t(x_i)^2] + \Omega(f_t) + constant
|
||||
```
|
||||
|
||||
The form of MSE is friendly, with a first order term (usually called residual) and a quadratic term.
|
||||
For other losses of interest (for example, the logistic loss), it is not so easy to get such a nice form.
|
||||
So in the general case, we take the Taylor expansion of the loss function up to the second order
|
||||
|
||||
```math
|
||||
Obj^{(t)} = \sum_{i=1}^n [l(y_i, \hat{y}_i^{(t-1)}) + g_i f_t(x_i) + \frac{1}{2} h_i f_t^2(x_i)] + \Omega(f_t) + constant
|
||||
```
|
||||
where the ``$g_i$`` and ``$h_i$`` are defined as
|
||||
|
||||
```math
|
||||
g_i &= \partial_{\hat{y}_i^{(t)}} l(y_i, \hat{y}_i^{(t-1)})\\
|
||||
h_i &= \partial_{\hat{y}_i^{(t)}}^2 l(y_i, \hat{y}_i^{(t-1)})
|
||||
```
|
||||
|
||||
After we remove all the constants, the specific objective at step ``$t$`` becomes
|
||||
|
||||
```math
|
||||
\sum_{i=1}^n [g_i f_t(x_i) + \frac{1}{2} h_i f_t^2(x_i)] + \Omega(f_t)
|
||||
```
|
||||
|
||||
This becomes our optimization goal for the new tree. One important advantage of this definition is that
|
||||
it only depends on ``$g_i$`` and ``$h_i$``; this is how xgboost supports customized loss functions.
|
||||
We can optimize every loss function, including logistic regression and weighted logistic regression, using exactly
|
||||
the same solver that takes ``$g_i$`` and ``$h_i$`` as input!
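For instance, here is a small numpy sketch (mirroring the custom objective used in this commit's R test suite) of the ``$g_i$``/``$h_i$`` pairs for the two losses above; the solver only ever sees these two vectors:

```python
import numpy as np

def squared_error_grad_hess(preds, labels):
    # l = (y - y_hat)^2  ->  g_i = 2*(y_hat - y), h_i = 2
    grad = 2.0 * (preds - labels)
    hess = 2.0 * np.ones_like(preds)
    return grad, hess

def logistic_grad_hess(preds, labels):
    # preds are raw margin scores; map them to probabilities first
    p = 1.0 / (1.0 + np.exp(-preds))
    grad = p - labels          # g_i
    hess = p * (1.0 - p)       # h_i
    return grad, hess
```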
|
||||
|
||||
### Model Complexity
|
||||
We have introduced the training step, but wait, there is one important thing, the ***regularization***!
|
||||
We need to define the complexity of the tree ``$\Omega(f)$``. In order to do so, let us first refine the definition of a tree ``$ f(x) $`` as
|
||||
|
||||
```math
|
||||
f_t(x) = w_{q(x)}, w \in R^T, q:R^d\rightarrow \{1,2,\cdots,T\} .
|
||||
```
|
||||
|
||||
Here ``$ w $`` is the vector of scores on leaves, ``$ q $`` is a function assigning each data point to the corresponding leaf, and ``$ T $`` is the number of leaves.
|
||||
In XGBoost, we define the complexity as
|
||||
|
||||
```math
|
||||
\Omega(f) = \gamma T + \frac{1}{2}\lambda \sum_{j=1}^T w_j^2
|
||||
```
|
||||
Of course there is more than one way to define the complexity, but this specific one works well in practice. The regularization is one part most tree packages treat
|
||||
less carefully, or simply ignore. This was because the traditional treatment of tree learning only emphasized improving impurity, while the complexity control
|
||||
was left to heuristics. By defining it formally, we can get a better idea of what we are learning, and yes, it works well in practice.
|
||||
|
||||
### The Structure Score
|
||||
|
||||
Here is the magical part of the derivation. After reformulating the tree model, we can write the objective value with the ``$ t $``-th tree as:
|
||||
|
||||
```math
|
||||
Obj^{(t)} &\approx \sum_{i=1}^n [g_i w_{q(x_i)} + \frac{1}{2} h_i w_{q(x_i)}^2] + \gamma T + \frac{1}{2}\lambda \sum_{j=1}^T w_j^2\\
|
||||
&= \sum^T_{j=1} [(\sum_{i\in I_j} g_i) w_j + \frac{1}{2} (\sum_{i\in I_j} h_i + \lambda) w_j^2 ] + \gamma T
|
||||
```
|
||||
|
||||
where ``$ I_j = \{i|q(x_i)=j\} $`` is the set of indices of data points assigned to the ``$ j $``-th leaf. Notice that in the second line we have changed the index of the summation, because all the data points on the same leaf get the same score. We could further compress the expression by defining ``$ G_j = \sum_{i\in I_j} g_i $`` and ``$ H_j = \sum_{i\in I_j} h_i $``:
|
||||
|
||||
```math
|
||||
Obj^{(t)} = \sum^T_{j=1} [G_jw_j + \frac{1}{2} (H_j+\lambda) w_j^2] +\gamma T
|
||||
```
|
||||
|
||||
In this equation the ``$ w_j $`` are independent of each other, the form ``$ G_jw_j+\frac{1}{2}(H_j+\lambda)w_j^2 $`` is quadratic, and the best ``$ w_j $`` for a given structure ``$q(x)$`` and the best objective reduction we can get are:
|
||||
|
||||
```math
|
||||
w_j^\ast = -\frac{G_j}{H_j+\lambda}\\
|
||||
Obj^\ast = -\frac{1}{2} \sum_{j=1}^T \frac{G_j^2}{H_j+\lambda} + \gamma T
|
||||
```
|
||||
The last equation measures ***how good*** a tree structure ``$q(x)$`` is.
|
||||
|
||||

|
||||
|
||||
If all this sounds a bit complicated, let us take a look at the picture and see how the scores can be calculated.
|
||||
Basically, for a given tree structure, we push the statistics ``$g_i$`` and ``$h_i$`` to the leaves they belong to,
|
||||
sum the statistics together, and use the formula to calculate how good the tree is.
|
||||
This score is like the impurity measure in a decision tree, except that it also takes the model complexity into account.
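As a worked sketch (the gradient statistics and the leaf assignment below are made up), this is how the optimal leaf weights ``$w_j^\ast$`` and the structure score ``$Obj^\ast$`` follow from ``$G_j$``, ``$H_j$``, ``$\lambda$`` and ``$\gamma$``:

```python
import numpy as np

g = np.array([0.3, -0.8, 0.5, -0.1])   # g_i for four instances
h = np.array([0.2,  0.3, 0.25, 0.15])  # h_i for the same instances
leaf = np.array([0, 0, 1, 1])          # q(x_i): which leaf each instance falls into

lam, gamma = 1.0, 0.1
T = leaf.max() + 1                     # number of leaves

G = np.array([g[leaf == j].sum() for j in range(T)])   # G_j
H = np.array([h[leaf == j].sum() for j in range(T)])   # H_j

w_star = -G / (H + lam)                                    # optimal leaf weights
obj_star = -0.5 * np.sum(G ** 2 / (H + lam)) + gamma * T   # structure score
print(w_star, obj_star)
```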
|
||||
|
||||
### Learn the tree structure
|
||||
Now that we have a way to measure how good a tree is, ideally we could enumerate all possible trees and pick the best one.
|
||||
In practice this is impossible, so we will try to optimize one level of the tree at a time.
|
||||
Specifically, we try to split a leaf into two leaves, and the score it gains is
|
||||
|
||||
```math
|
||||
Gain = \frac{1}{2} \left[\frac{G_L^2}{H_L+\lambda}+\frac{G_R^2}{H_R+\lambda}-\frac{(G_L+G_R)^2}{H_L+H_R+\lambda}\right] - \gamma
|
||||
```
|
||||
This formula can be decomposed as 1) the score on the new left leaf, 2) the score on the new right leaf, 3) the score on the original leaf, and 4) the regularization on the additional leaf.
|
||||
We can find an important fact here: if the gain is smaller than ``$\gamma$``, we would do better not to add that branch. This is exactly the ***pruning*** technique in tree based
|
||||
models! By using the principles of supervised learning, we can naturally come up with the reason behind these techniques :)
|
||||
|
||||
For real valued data, we usually want to search for an optimal split. To do so efficiently, we place all the instances in sorted order, like in the following picture.
|
||||

|
||||
Then a left to right scan is sufficient to calculate the structure score of all possible split solutions, and we can find the best split efficiently.
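A minimal sketch of that scan for a single feature (made-up data; the real xgboost implementation does this in C++ with many more details, such as missing-value handling): sort by feature value, keep running sums ``$G_L$``, ``$H_L$``, and evaluate the gain formula at every split point.

```python
import numpy as np

def best_split(x, g, h, lam=1.0, gamma=0.0):
    """Left-to-right scan over one sorted feature to find the best split."""
    order = np.argsort(x)
    x, g, h = x[order], g[order], h[order]
    G, H = g.sum(), h.sum()

    G_L = H_L = 0.0
    best_gain, best_threshold = 0.0, None
    for i in range(len(x) - 1):
        G_L += g[i]
        H_L += h[i]
        if x[i] == x[i + 1]:
            continue  # cannot split between identical feature values
        G_R, H_R = G - G_L, H - H_L
        gain = 0.5 * (G_L ** 2 / (H_L + lam)
                      + G_R ** 2 / (H_R + lam)
                      - G ** 2 / (H + lam)) - gamma
        if gain > best_gain:
            best_gain, best_threshold = gain, (x[i] + x[i + 1]) / 2.0
    return best_gain, best_threshold
```

Calling `best_split` with a feature vector and the ``$g_i$``, ``$h_i$`` statistics from the previous sketch returns the candidate threshold with the highest gain, or `None` if no split beats the ``$\gamma$`` penalty.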
|
||||
|
||||
Final words on XGBoost
|
||||
----------------------
|
||||
Now that you understand what a boosted tree is, you may ask: where is the introduction to [XGBoost](https://github.com/dmlc/xgboost)?
|
||||
XGBoost is exactly a tool motivated by the formal principle introduced in this tutorial!
|
||||
More importantly, it is developed with both deep consideration in terms of ***systems optimization*** and ***principles in machine learning***.
|
||||
The goal of this library is to push the computation limits of machines to the extreme in order to provide a ***scalable***, ***portable*** and ***accurate*** library.
|
||||
Make sure you [try it out](https://github.com/dmlc/xgboost), and most importantly, contribute your piece of wisdom (code, examples, tutorials) to the community!
|
||||
@ -46,6 +46,10 @@ Parameters for Tree Booster
|
||||
* colsample_bytree [default=1]
|
||||
- subsample ratio of columns when constructing each tree (a short usage sketch follows this parameter list).
|
||||
- range: (0,1]
|
||||
* lambda [default=1]
|
||||
- L2 regularization term on weights
|
||||
* alpha [default=0]
|
||||
- L1 regularization term on weights
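For reference (see the note under colsample_bytree above), a hedged sketch of how these booster parameters are passed in the Python package; the data is synthetic and the values are arbitrary, not recommendations:

```python
import numpy as np
import xgboost as xgb

X = np.random.rand(200, 10)
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)
dtrain = xgb.DMatrix(X, label=y)

params = {
    'objective': 'binary:logistic',
    'max_depth': 3,
    'eta': 0.3,
    'colsample_bytree': 0.8,  # subsample ratio of columns per tree
    'lambda': 1.0,            # L2 regularization term on weights
    'alpha': 0.0,             # L1 regularization term on weights
}
bst = xgb.train(params, dtrain, num_boost_round=10)
```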
|
||||
|
||||
Parameters for Linear Booster
|
||||
-----------------------------
|
||||
@ -105,7 +109,7 @@ The following parameters are only used in the console version of xgboost
|
||||
* task [default=train] options: train, pred, eval, dump
|
||||
- train: training using data
|
||||
- pred: making prediction for test:data
|
||||
- eval: for evaluating statistics specified by eval[name]=filenam
|
||||
- eval: for evaluating statistics specified by eval[name]=filename
|
||||
- dump: for dumping the learned model into text format (preliminary)
|
||||
* model_in [default=NULL]
|
||||
- path to input model, needed for test, eval, dump, if it is specified in training, xgboost will continue training from the input model
|
||||
|
||||
@ -1,2 +0,0 @@
|
||||
commonmark
|
||||
|
||||
@ -35,3 +35,13 @@ Scikit-Learn API
|
||||
.. autoclass:: xgboost.XGBClassifier
|
||||
:members:
|
||||
:show-inheritance:
|
||||
|
||||
Plotting API
|
||||
------------
|
||||
.. automodule:: xgboost.plotting
|
||||
|
||||
.. autofunction:: xgboost.plot_importance
|
||||
|
||||
.. autofunction:: xgboost.plot_tree
|
||||
|
||||
.. autofunction:: xgboost.to_graphviz
|
||||
|
||||
@ -127,3 +127,27 @@ If early stopping is enabled during training, you can predict with the best iter
|
||||
```python
|
||||
ypred = bst.predict(xgmat,ntree_limit=bst.best_iteration)
|
||||
```
|
||||
|
||||
Plotting
|
||||
--------
|
||||
|
||||
You can use the plotting module to plot feature importance and the output tree.
|
||||
|
||||
To plot importance, use ``plot_importance``. This function requires ``matplotlib`` to be installed.
|
||||
|
||||
```python
|
||||
xgb.plot_importance(bst)
|
||||
```
|
||||
|
||||
To plot the output tree via ``matplotlib``, use ``plot_tree``, specifying the ordinal number of the target tree.
|
||||
This function requires ``graphviz`` and ``matplotlib``.
|
||||
|
||||
```python
|
||||
xgb.plot_tree(bst, num_trees=2)
|
||||
```
|
||||
|
||||
When you use ``IPython``, you can use the ``to_graphviz`` function, which converts the target tree to a ``graphviz`` instance. The ``graphviz`` instance is automatically rendered in ``IPython``.
|
||||
|
||||
```python
|
||||
xgb.to_graphviz(bst, num_trees=2)
|
||||
```
|
||||
@ -6,12 +6,12 @@ import docutils
|
||||
import subprocess
|
||||
|
||||
if os.environ.get('READTHEDOCS', None) == 'True':
|
||||
subprocess.call('cd ..; rm -rf recommonmark recom;' +
|
||||
'git clone https://github.com/tqchen/recommonmark;' +
|
||||
'mv recommonmark/recommonmark recom', shell=True)
|
||||
subprocess.call('cd ..; rm -rf recommonmark;' +
|
||||
'git clone https://github.com/tqchen/recommonmark', shell=True)
|
||||
|
||||
sys.path.insert(0, os.path.abspath('..'))
|
||||
from recom import parser, transform
|
||||
sys.path.insert(0, os.path.abspath('../recommonmark/'))
|
||||
|
||||
from recommonmark import parser, transform
|
||||
|
||||
MarkdownParser = parser.CommonMarkParser
|
||||
AutoStructify = transform.AutoStructify
|
||||
|
||||
python-package/MANIFEST.in (new file, 7 lines)
@ -0,0 +1,7 @@
|
||||
include *.sh *.md
|
||||
recursive-include xgboost *
|
||||
recursive-include xgboost/wrapper *
|
||||
recursive-include xgboost/windows *
|
||||
recursive-include xgboost/subtree *
|
||||
recursive-include xgboost/src *
|
||||
recursive-include xgboost/multi-node *
|
||||
@ -1,7 +1,27 @@
|
||||
XGBoost Python Package
|
||||
======================
|
||||
Installation
|
||||
------------
|
||||
We are on [PyPI](https://pypi.python.org/pypi/xgboost) now. For the stable version, please install using pip:
|
||||
|
||||
* ```pip install xgboost```
|
||||
* Note for Windows users: this pip installation may not work in some Windows environments, and it may cause unexpected errors. pip installation on Windows is currently disabled for further investigation; please install from GitHub.
|
||||
|
||||
For the up-to-date version, please install from GitHub; a quick sanity check is sketched after the install steps below.
|
||||
|
||||
* To make the python module, type ```./build.sh``` in the root directory of the project
|
||||
* Make sure you have [setuptools](https://pypi.python.org/pypi/setuptools)
|
||||
* Install with `python setup.py install` from this directory.
|
||||
* For windows users, please use the Visual Studio project file under [windows folder](../windows/). See also the [installation tutorial](https://www.kaggle.com/c/otto-group-product-classification-challenge/forums/t/13043/run-xgboost-from-windows-and-python) from Kaggle Otto Forum.
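Whichever install path you used, a quick sanity check (a sketch with synthetic data) is to import the package and train a tiny model:

```python
import numpy as np
import xgboost as xgb

print(xgb.__version__)

X = np.random.rand(50, 4)
y = (X.sum(axis=1) > 2.0).astype(int)
bst = xgb.train({'objective': 'binary:logistic', 'max_depth': 2},
                xgb.DMatrix(X, label=y), num_boost_round=2)
print(bst.predict(xgb.DMatrix(X))[:5])
```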
|
||||
|
||||
Examples
|
||||
------
|
||||
|
||||
* Refer also to the walk through example in [demo folder](../demo/guide-python)
|
||||
* **NOTE**: if you want to run XGBoost process in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the sklearn_parallel.py demo.
|
||||
* See also the [example scripts](../demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](../demo/kaggle-higgs/speedtest.py) on this dataset.
|
||||
|
||||
Note
|
||||
-----
|
||||
|
||||
* If you want to build xgboost on Mac OS X with multiprocessing support, which the clang in XCode does not support by default, please install gcc 4.9 or higher using [homebrew](http://brew.sh/): ```brew tap homebrew/versions; brew install gcc49```
|
||||
* If you want to run XGBoost process in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the [sklearn_parallel.py](../demo/guide-python/sklearn_parallel.py) demo.
|
||||
|
||||
python-package/setup.cfg (new file, 2 lines)
@ -0,0 +1,2 @@
|
||||
[metadata]
|
||||
description-file = README.md
|
||||
@ -2,20 +2,48 @@
"""Setup xgboost package."""
from __future__ import absolute_import
import sys
from setuptools import setup
from setuptools import setup, find_packages
import subprocess
sys.path.insert(0, '.')

import os
# build on the fly if installing via pip,
# otherwise use build.sh in the parent directory

if 'pip' in __file__:
    if not os.name == 'nt':  # if not windows
        build_sh = subprocess.Popen(['sh', 'xgboost/build-python.sh'])
        build_sh.wait()
        output = build_sh.communicate()
        print(output)

import xgboost

LIB_PATH = xgboost.core.find_lib_path()
# print(LIB_PATH)

# to deploy to pip, please use
#   make pythonpack
#   python setup.py register sdist upload
# and be sure to test it first using "python setup.py register sdist upload -r pypitest"
setup(name='xgboost',
      version=xgboost.__version__,
      #version='0.4a13',
      description=xgboost.__doc__,
      install_requires=[
          'numpy',
          'scipy',
      ],
      maintainer='Hongliang Liu',
      maintainer_email='phunter.lau@gmail.com',
      zip_safe=False,
      packages=['xgboost'],
      data_files=[('xgboost', [LIB_PATH[0]])],
      packages=find_packages(),
      # package_dir and package_data are not needed; everything is handled by MANIFEST.in
      #package_dir = {'':'xgboost'},
      #package_data = {'': ['*.txt','*.md','*.sh'],
      #                }
      # this makes pip honor MANIFEST.in during install, where the additional files are specified;
      # this is the golden line
      include_package_data=True,
      data_files=[('xgboost', LIB_PATH)],
      url='https://github.com/dmlc/xgboost')
@ -8,9 +8,11 @@ from __future__ import absolute_import
from .core import DMatrix, Booster
from .training import train, cv
from .sklearn import XGBModel, XGBClassifier, XGBRegressor
from .plotting import plot_importance, plot_tree, to_graphviz

__version__ = '0.4'

__all__ = ['DMatrix', 'Booster',
           'train', 'cv',
           'XGBModel', 'XGBClassifier', 'XGBRegressor']
           'XGBModel', 'XGBClassifier', 'XGBRegressor',
           'plot_importance', 'plot_tree', 'to_graphviz']
26
python-package/xgboost/build-python.sh
Executable file
@ -0,0 +1,26 @@
#!/bin/bash
# This is a simple script to build xgboost on Mac and Linux for the Python wrapper only.
# Basically, it first tries to make with OpenMP; if that fails, it disables OpenMP and makes again.
# This automatically builds xgboost for Mac users who don't have OpenMP support.
# In most cases, typing make will give you what you want.

# See additional instructions in doc/build.md

# note: this script is built for the python package only, and it might have a filename
# conflict with build.sh, which builds everything.


pushd xgboost
if make python; then
    echo "Successfully built multi-thread xgboost"
else
    echo "-----------------------------"
    echo "Building multi-thread xgboost failed"
    echo "Start to build single-thread xgboost"
    make clean
    make python no_omp=1
    echo "Successfully built single-thread xgboost"
    echo "If you want the multi-threaded version"
    echo "see additional instructions in doc/build.md"
fi
popd
@ -1,5 +1,5 @@
# coding: utf-8
# pylint: disable=too-many-arguments
# pylint: disable=too-many-arguments, too-many-branches
"""Core XGBoost Library."""
from __future__ import absolute_import

@ -23,8 +23,9 @@ class XGBoostError(Exception):


if sys.version_info[0] == 3:
    # pylint: disable=invalid-name
    # pylint: disable=invalid-name, redefined-builtin
    STRING_TYPES = str,
    unicode = str
else:
    # pylint: disable=invalid-name
    STRING_TYPES = basestring,
@ -39,12 +40,18 @@ def find_lib_path():
    List of all found library paths to xgboost
    """
    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    dll_path = [curr_path, os.path.join(curr_path, '../../wrapper/')]
    # 'make pythonpack' hack: this directory is copied one level up for setup.py
    dll_path = [curr_path, os.path.join(curr_path, '../../wrapper/'),
                os.path.join(curr_path, './wrapper/')]
    if os.name == 'nt':
        if platform.architecture()[0] == '64bit':
            dll_path.append(os.path.join(curr_path, '../../windows/x64/Release/'))
            # hack for pip installation, where the whole parent source directory is copied here
            dll_path.append(os.path.join(curr_path, './windows/x64/Release/'))
        else:
            dll_path.append(os.path.join(curr_path, '../../windows/Release/'))
            # hack for pip installation, where the whole parent source directory is copied here
            dll_path.append(os.path.join(curr_path, './windows/Release/'))
    if os.name == 'nt':
        dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path]
    else:
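For orientation, a hedged sketch of how such a path list is typically consumed: the first existing candidate is handed to `ctypes` to load the shared library. The exact loading code is not part of this hunk, so the snippet below is illustrative only.

```python
# Illustrative only: loading the first existing library candidate from find_lib_path().
import ctypes
import os

def load_xgboost_wrapper(candidates):
    """Load the first existing xgboost wrapper library from a list of paths."""
    existing = [p for p in candidates if os.path.exists(p)]
    if not existing:
        raise RuntimeError('cannot find the xgboost wrapper library')
    return ctypes.cdll.LoadLibrary(existing[0])
```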
@ -125,7 +132,11 @@ class DMatrix(object):
    which is optimized for both memory efficiency and training speed.
    You can construct DMatrix from numpy.arrays
    """
    def __init__(self, data, label=None, missing=0.0, weight=None, silent=False):

    feature_names = None  # for previous version's pickle

    def __init__(self, data, label=None, missing=0.0,
                 weight=None, silent=False, feature_names=None):
        """
        Data matrix used in XGBoost.

@ -143,6 +154,8 @@ class DMatrix(object):
            Weight for each instance.
        silent : boolean, optional
            Whether to print messages during construction
        feature_names : list, optional
            Labels for features.
        """
        # force into void_p; mac needs to pass things in as void_p
        if data is None:
@ -170,6 +183,21 @@ class DMatrix(object):
        if weight is not None:
            self.set_weight(weight)

        # validate feature names
        if feature_names is not None:
            if not isinstance(feature_names, list):
                feature_names = list(feature_names)
            if len(feature_names) != len(set(feature_names)):
                raise ValueError('feature_names must be unique')
            if len(feature_names) != self.num_col():
                msg = 'feature_names must have the same length as data'
                raise ValueError(msg)
            # prohibit symbols that may affect parsing, e.g. ``[]=.``
            if not all(isinstance(f, STRING_TYPES) and f.isalnum()
                       for f in feature_names):
                raise ValueError('all feature_names must be alphanumeric')
        self.feature_names = feature_names
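A brief usage sketch of the new argument (mirroring the test added later in this commit): construct a DMatrix with feature names and they are validated on the spot and carried along for slicing and model dumps.

```python
# Sketch: naming features at construction time (data values here are made up).
import numpy as np
import xgboost as xgb

data = np.random.randn(100, 5)
label = np.array([0, 1] * 50)
dm = xgb.DMatrix(data, label=label,
                 feature_names=['f1', 'f2', 'f3', 'f4', 'f5'])
assert dm.feature_names == ['f1', 'f2', 'f3', 'f4', 'f5']
assert dm.num_col() == 5
```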

    def _init_from_csr(self, csr):
        """
        Initialize data from a CSR matrix.
@ -385,6 +413,18 @@ class DMatrix(object):
                                            ctypes.byref(ret)))
        return ret.value

    def num_col(self):
        """Get the number of columns (features) in the DMatrix.

        Returns
        -------
        number of columns : int
        """
        ret = ctypes.c_uint()
        _check_call(_LIB.XGDMatrixNumCol(self.handle,
                                         ctypes.byref(ret)))
        return ret.value

    def slice(self, rindex):
        """Slice the DMatrix and return a new DMatrix that only contains `rindex`.

@ -398,7 +438,7 @@ class DMatrix(object):
        res : DMatrix
            A new DMatrix containing only selected indices.
        """
        res = DMatrix(None)
        res = DMatrix(None, feature_names=self.feature_names)
        res.handle = ctypes.c_void_p()
        _check_call(_LIB.XGDMatrixSliceDMatrix(self.handle,
                                               c_array(ctypes.c_int, rindex),
@ -413,6 +453,9 @@ class Booster(object):
    Booster is the model of xgboost, containing low-level routines for
    training, prediction and evaluation.
    """

    feature_names = None

    def __init__(self, params=None, cache=(), model_file=None):
        # pylint: disable=invalid-name
        """Initialize the Booster.
@ -429,6 +472,7 @@ class Booster(object):
        for d in cache:
            if not isinstance(d, DMatrix):
                raise TypeError('invalid cache item: {}'.format(type(d).__name__))
            self._validate_feature_names(d)
        dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
        self.handle = ctypes.c_void_p()
        _check_call(_LIB.XGBoosterCreate(dmats, len(cache), ctypes.byref(self.handle)))
@ -513,6 +557,8 @@ class Booster(object):
        """
        if not isinstance(dtrain, DMatrix):
            raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
        self._validate_feature_names(dtrain)

        if fobj is None:
            _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
        else:
@ -537,6 +583,8 @@ class Booster(object):
            raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))
        if not isinstance(dtrain, DMatrix):
            raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
        self._validate_feature_names(dtrain)

        _check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle,
                                               c_array(ctypes.c_float, grad),
                                               c_array(ctypes.c_float, hess),
@ -566,6 +614,8 @@ class Booster(object):
                raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__))
            if not isinstance(d[1], STRING_TYPES):
                raise TypeError('expected string, got {}'.format(type(d[1]).__name__))
            self._validate_feature_names(d[0])

        dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals])
        evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals])
        msg = ctypes.c_char_p()
@ -599,6 +649,7 @@ class Booster(object):
        result: str
            Evaluation result string.
        """
        self._validate_feature_names(data)
        return self.eval_set([(data, name)], iteration)

    def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False):
@ -636,6 +687,9 @@ class Booster(object):
            option_mask |= 0x01
        if pred_leaf:
            option_mask |= 0x02

        self._validate_feature_names(data)

        length = ctypes.c_ulong()
        preds = ctypes.POINTER(ctypes.c_float)()
        _check_call(_LIB.XGBoosterPredict(self.handle, data.handle,
@ -725,16 +779,46 @@ class Booster(object):
        """
        Returns the model dump as a list of strings.
        """

        length = ctypes.c_ulong()
        sarr = ctypes.POINTER(ctypes.c_char_p)()
        _check_call(_LIB.XGBoosterDumpModel(self.handle,
                                            c_str(fmap),
                                            int(with_stats),
                                            ctypes.byref(length),
                                            ctypes.byref(sarr)))
        if self.feature_names is not None and fmap == '':
            flen = int(len(self.feature_names))
            fname = (ctypes.c_char_p * flen)()
            ftype = (ctypes.c_char_p * flen)()

            # supports quantitative type only
            # {'q': quantitative, 'i': indicator}
            if sys.version_info[0] == 3:
                features = [bytes(f, 'utf-8') for f in self.feature_names]
                types = [bytes('q', 'utf-8')] * flen
            else:
                features = [f.encode('utf-8') if isinstance(f, unicode) else f
                            for f in self.feature_names]
                types = ['q'] * flen

            fname[:] = features
            ftype[:] = types
            _check_call(_LIB.XGBoosterDumpModelWithFeatures(self.handle,
                                                            flen,
                                                            fname,
                                                            ftype,
                                                            int(with_stats),
                                                            ctypes.byref(length),
                                                            ctypes.byref(sarr)))
        else:
            _check_call(_LIB.XGBoosterDumpModel(self.handle,
                                                c_str(fmap),
                                                int(with_stats),
                                                ctypes.byref(length),
                                                ctypes.byref(sarr)))

        res = []
        for i in range(length.value):
            res.append(str(sarr[i].decode('ascii')))
            try:
                res.append(str(sarr[i].decode('ascii')))
            except UnicodeDecodeError:
                res.append(unicode(sarr[i].decode('utf-8')))
        return res
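A brief usage sketch: when the DMatrix used for training carried `feature_names`, the dump refers to those names instead of the default `f0, f1, ...` indices. The exact text depends on the trained trees, so the sample in the comment is only indicative, and `bst` is assumed to be such a trained Booster.

```python
# Sketch: dumping a trained booster whose DMatrix had feature_names set.
for i, tree_text in enumerate(bst.get_dump(with_stats=False)):
    # each entry is one tree, e.g. "0:[Feature1<1.5] yes=1,no=2,missing=1 ..."
    print('tree %d:' % i)
    print(tree_text)
```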

    def get_fscore(self, fmap=''):
@ -759,3 +843,17 @@ class Booster(object):
            else:
                fmap[fid] += 1
        return fmap

    def _validate_feature_names(self, data):
        """
        Validate that the Booster's and the data's feature_names are identical.
        """
        if self.feature_names is None:
            self.feature_names = data.feature_names
        else:
            # Booster can't accept data with different feature names
            if self.feature_names != data.feature_names:
                msg = 'feature_names mismatch: {0} {1}'
                raise ValueError(msg.format(self.feature_names,
                                            data.feature_names))
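The effect of this check, sketched below: once a Booster has seen data with one set of names, predicting or evaluating on a DMatrix with different names raises a ValueError. Parameters in the sketch are illustrative only.

```python
# Sketch: mismatching feature names are rejected.
import numpy as np
import xgboost as xgb

X = np.random.randn(50, 3)
y = np.array([0, 1] * 25)
dtrain = xgb.DMatrix(X, label=y, feature_names=['a1', 'a2', 'a3'])
bst = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=2)

other = xgb.DMatrix(X, label=y, feature_names=['b1', 'b2', 'b3'])
try:
    bst.predict(other)
except ValueError as err:
    print(err)  # feature_names mismatch: ['a1', 'a2', 'a3'] ['b1', 'b2', 'b3']
```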

223
python-package/xgboost/plotting.py
Normal file
@ -0,0 +1,223 @@
# coding: utf-8
# pylint: disable=too-many-locals, too-many-arguments, invalid-name,
# pylint: disable=too-many-branches
"""Plotting Library."""
from __future__ import absolute_import

import re
import numpy as np
from .core import Booster

from io import BytesIO


def plot_importance(booster, ax=None, height=0.2,
                    xlim=None, title='Feature importance',
                    xlabel='F score', ylabel='Features',
                    grid=True, **kwargs):
    """Plot importance based on fitted trees.

    Parameters
    ----------
    booster : Booster or dict
        Booster instance, or dict taken by Booster.get_fscore()
    ax : matplotlib Axes, default None
        Target axes instance. If None, new figure and axes will be created.
    height : float, default 0.2
        Bar height, passed to ax.barh()
    xlim : tuple, default None
        Tuple passed to axes.xlim()
    title : str, default "Feature importance"
        Axes title. To disable, pass None.
    xlabel : str, default "F score"
        X axis title label. To disable, pass None.
    ylabel : str, default "Features"
        Y axis title label. To disable, pass None.
    kwargs :
        Other keywords passed to ax.barh()

    Returns
    -------
    ax : matplotlib Axes
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        raise ImportError('You must install matplotlib to plot importance')

    if isinstance(booster, Booster):
        importance = booster.get_fscore()
    elif isinstance(booster, dict):
        importance = booster
    else:
        raise ValueError('booster must be a Booster or dict instance')

    if len(importance) == 0:
        raise ValueError('Booster.get_fscore() returned an empty result')

    tuples = [(k, importance[k]) for k in importance]
    tuples = sorted(tuples, key=lambda x: x[1])
    labels, values = zip(*tuples)

    if ax is None:
        _, ax = plt.subplots(1, 1)

    ylocs = np.arange(len(values))
    ax.barh(ylocs, values, align='center', height=height, **kwargs)

    for x, y in zip(values, ylocs):
        ax.text(x + 1, y, x, va='center')

    ax.set_yticks(ylocs)
    ax.set_yticklabels(labels)

    if xlim is not None:
        if not isinstance(xlim, tuple) or len(xlim) != 2:
            raise ValueError('xlim must be a tuple of 2 elements')
    else:
        xlim = (0, max(values) * 1.1)
    ax.set_xlim(xlim)

    if title is not None:
        ax.set_title(title)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    ax.grid(grid)
    return ax
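A short usage sketch, matching the tests added later in this commit; the model file name is a placeholder for whatever saved booster you have at hand:

```python
# Sketch: plotting feature importance from a previously trained booster.
import matplotlib
matplotlib.use('Agg')            # headless backend, as in the tests
import xgboost as xgb

bst = xgb.Booster(model_file='xgb.model')   # placeholder model file
ax = xgb.plot_importance(bst)
ax.figure.savefig('importance.png')
```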


_NODEPAT = re.compile(r'(\d+):\[(.+)\]')
_LEAFPAT = re.compile(r'(\d+):(leaf=.+)')
_EDGEPAT = re.compile(r'yes=(\d+),no=(\d+),missing=(\d+)')


def _parse_node(graph, text):
    """parse dumped node"""
    match = _NODEPAT.match(text)
    if match is not None:
        node = match.group(1)
        graph.node(node, label=match.group(2), shape='circle')
        return node
    match = _LEAFPAT.match(text)
    if match is not None:
        node = match.group(1)
        graph.node(node, label=match.group(2), shape='box')
        return node
    raise ValueError('Unable to parse node: {0}'.format(text))


def _parse_edge(graph, node, text, yes_color='#0000FF', no_color='#FF0000'):
    """parse dumped edge"""
    match = _EDGEPAT.match(text)
    if match is not None:
        yes, no, missing = match.groups()
        if yes == missing:
            graph.edge(node, yes, label='yes, missing', color=yes_color)
            graph.edge(node, no, label='no', color=no_color)
        else:
            graph.edge(node, yes, label='yes', color=yes_color)
            graph.edge(node, no, label='no, missing', color=no_color)
        return
    raise ValueError('Unable to parse edge: {0}'.format(text))
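For reference, a sketch of the kind of whitespace-separated dump tokens these regexes are written against; the split condition and node ids below are made up, the real text comes from Booster.get_dump().

```python
# Illustrative tokens only.
import re

_NODEPAT = re.compile(r'(\d+):\[(.+)\]')
_EDGEPAT = re.compile(r'yes=(\d+),no=(\d+),missing=(\d+)')

print(_NODEPAT.match('0:[f0<1.5]').groups())            # ('0', 'f0<1.5')
print(_EDGEPAT.match('yes=1,no=2,missing=1').groups())  # ('1', '2', '1')
```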


def to_graphviz(booster, num_trees=0, rankdir='UT',
                yes_color='#0000FF', no_color='#FF0000', **kwargs):
    """Convert specified tree to graphviz instance. IPython can automatically plot the
    returned graphviz instance. Otherwise, you should call the .render() method
    of the returned graphviz instance.

    Parameters
    ----------
    booster : Booster
        Booster instance
    num_trees : int, default 0
        Specify the ordinal number of target tree
    rankdir : str, default "UT"
        Passed to graphviz via graph_attr
    yes_color : str, default '#0000FF'
        Edge color when the node condition is met.
    no_color : str, default '#FF0000'
        Edge color when the node condition is not met.
    kwargs :
        Other keywords passed to graphviz graph_attr

    Returns
    -------
    graph : graphviz.Digraph
    """
    try:
        from graphviz import Digraph
    except ImportError:
        raise ImportError('You must install graphviz to plot tree')

    if not isinstance(booster, Booster):
        raise ValueError('booster must be Booster instance')

    tree = booster.get_dump()[num_trees]
    tree = tree.split()

    kwargs = kwargs.copy()
    kwargs.update({'rankdir': rankdir})
    graph = Digraph(graph_attr=kwargs)

    for i, text in enumerate(tree):
        if text[0].isdigit():
            node = _parse_node(graph, text)
        else:
            if i == 0:
                # 1st string must be node
                raise ValueError('Unable to parse given string as tree')
            _parse_edge(graph, node, text, yes_color=yes_color,
                        no_color=no_color)

    return graph
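A usage sketch, continuing from the importance example above: in a plain script (outside IPython) the returned Digraph is rendered to a file explicitly.

```python
# Sketch: exporting one tree of the trained booster via graphviz.
g = xgb.to_graphviz(bst, num_trees=0)
g.render('tree0')   # writes tree0 plus tree0.pdf (graphviz's default format)
```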


def plot_tree(booster, num_trees=0, rankdir='UT', ax=None, **kwargs):
    """Plot specified tree.

    Parameters
    ----------
    booster : Booster
        Booster instance
    num_trees : int, default 0
        Specify the ordinal number of target tree
    rankdir : str, default "UT"
        Passed to graphviz via graph_attr
    ax : matplotlib Axes, default None
        Target axes instance. If None, new figure and axes will be created.
    kwargs :
        Other keywords passed to to_graphviz

    Returns
    -------
    ax : matplotlib Axes
    """
    try:
        import matplotlib.pyplot as plt
        import matplotlib.image as image
    except ImportError:
        raise ImportError('You must install matplotlib to plot tree')

    if ax is None:
        _, ax = plt.subplots(1, 1)

    g = to_graphviz(booster, num_trees=num_trees, rankdir=rankdir, **kwargs)

    s = BytesIO()
    s.write(g.pipe(format='png'))
    s.seek(0)
    img = image.imread(s)

    ax.imshow(img)
    ax.axis('off')
    return ax
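And the matplotlib route, sketched: the graphviz output is rasterized onto an Axes, which can then be saved like any other figure.

```python
# Sketch: rendering a tree into a matplotlib figure and saving it.
ax = xgb.plot_tree(bst, num_trees=0)
ax.figure.savefig('tree0.png', dpi=200)
```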
@ -319,7 +319,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
        if len(class_probs.shape) > 1:
            column_indexes = np.argmax(class_probs, axis=1)
        else:
            column_indexes = np.repeat(0, data.shape[0])
            column_indexes = np.repeat(0, class_probs.shape[0])
            column_indexes[class_probs > 0.5] = 1
        return self._le.inverse_transform(column_indexes)
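A hedged note on this one-line fix: in the binary case `class_probs` is a 1-D array, so sizing the index array from the prediction output itself avoids relying on the input having a compatible `.shape`. The thresholding being done, sketched with made-up numbers:

```python
# Sketch of the binary branch: a 1-D probability vector becomes 0/1 labels.
import numpy as np

class_probs = np.array([0.2, 0.7, 0.9, 0.1])
column_indexes = np.repeat(0, class_probs.shape[0])
column_indexes[class_probs > 0.5] = 1
print(column_indexes)  # [0 1 1 0]
```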


@ -118,7 +118,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
        sys.stderr.write(msg + '\n')

        if evals_result is not None:
            res = re.findall(":-([0-9.]+).", msg)
            res = re.findall(":-?([0-9.]+).", msg)
            for key, val in zip(evals_name, res):
                evals_result[key].append(val)
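Why the `-?` matters, as a quick sketch: the old pattern required a literal minus after the colon, so ordinary positive metric values in the evaluation message were never captured; making the sign optional captures both. The message format below is illustrative.

```python
# Sketch: effect of the optional minus sign in the pattern.
import re

msg = '[0]\ttrain-error:0.04652\teval-error:0.04822'
print(re.findall(":-([0-9.]+).", msg))   # [] -- old pattern only matched negative values
print(re.findall(":-?([0-9.]+).", msg))  # e.g. ['0.04652', '0.0482'] -- both metrics captured
```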


@ -5,9 +5,3 @@ if [ ${TRAVIS_OS_NAME} != "osx" ]; then
fi

brew update

if [ ${TASK} == "python-package" ]; then
    brew install python git
    easy_install pip
    pip install numpy scipy nose
fi
@ -33,9 +33,44 @@ if [ ${TASK} == "R-package" ]; then
    scripts/travis_R_script.sh || exit -1
fi

if [ ${TASK} == "python-package" ]; then
if [ ${TASK} == "python-package" -o ${TASK} == "python-package3" ]; then

    if [ ${TRAVIS_OS_NAME} == "osx" ]; then
        brew install graphviz
        if [ ${TASK} == "python-package3" ]; then
            wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
        else
            wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh
        fi
    else
        sudo apt-get install graphviz
        if [ ${TASK} == "python-package3" ]; then
            wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
        else
            wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh
        fi
    fi
    bash conda.sh -b -p $HOME/miniconda
    export PATH="$HOME/miniconda/bin:$PATH"
    hash -r
    conda config --set always_yes yes --set changeps1 no
    conda update -q conda
    # Useful for debugging any issues with conda
    conda info -a

    if [ ${TASK} == "python-package3" ]; then
        conda create -n myenv python=3.4
    else
        conda create -n myenv python=2.7
    fi
    source activate myenv
    conda install numpy scipy matplotlib nose
    python -m pip install graphviz

    make all CXX=${CXX} || exit -1
    nosetests tests/python || exit -1

    python -m nose tests/python || exit -1
    python --version
fi

# only test java under linux for now
@ -1 +1 @@
This folder contains tetstcases for xgboost.
This folder contains testcases for xgboost.
@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
import numpy as np
import xgboost as xgb

@ -29,3 +30,71 @@ def test_basic():
    # assert they are the same
    assert np.sum(np.abs(preds2 - preds)) == 0


def test_feature_names():
    data = np.random.randn(100, 5)
    target = np.array([0, 1] * 50)

    cases = [['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'],
             [u'要因1', u'要因2', u'要因3', u'要因4', u'要因5']]

    for features in cases:
        dm = xgb.DMatrix(data, label=target,
                         feature_names=features)
        assert dm.feature_names == features
        assert dm.num_row() == 100
        assert dm.num_col() == 5

        params = {'objective': 'multi:softprob',
                  'eval_metric': 'mlogloss',
                  'eta': 0.3,
                  'num_class': 3}

        bst = xgb.train(params, dm, num_boost_round=10)
        scores = bst.get_fscore()
        assert list(sorted(k for k in scores)) == features


def test_plotting():
    bst2 = xgb.Booster(model_file='xgb.model')
    # plotting

    import matplotlib
    matplotlib.use('Agg')

    from matplotlib.axes import Axes
    from graphviz import Digraph

    ax = xgb.plot_importance(bst2)
    assert isinstance(ax, Axes)
    assert ax.get_title() == 'Feature importance'
    assert ax.get_xlabel() == 'F score'
    assert ax.get_ylabel() == 'Features'
    assert len(ax.patches) == 4

    ax = xgb.plot_importance(bst2, color='r',
                             title='t', xlabel='x', ylabel='y')
    assert isinstance(ax, Axes)
    assert ax.get_title() == 't'
    assert ax.get_xlabel() == 'x'
    assert ax.get_ylabel() == 'y'
    assert len(ax.patches) == 4
    for p in ax.patches:
        assert p.get_facecolor() == (1.0, 0, 0, 1.0)  # red

    ax = xgb.plot_importance(bst2, color=['r', 'r', 'b', 'b'],
                             title=None, xlabel=None, ylabel=None)
    assert isinstance(ax, Axes)
    assert ax.get_title() == ''
    assert ax.get_xlabel() == ''
    assert ax.get_ylabel() == ''
    assert len(ax.patches) == 4
    assert ax.patches[0].get_facecolor() == (1.0, 0, 0, 1.0)  # red
    assert ax.patches[1].get_facecolor() == (1.0, 0, 0, 1.0)  # red
    assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0)  # blue
    assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0)  # blue

    g = xgb.to_graphviz(bst2, num_trees=0)
    assert isinstance(g, Digraph)
    ax = xgb.plot_tree(bst2, num_trees=0)
    assert isinstance(ax, Axes)
39
tests/python/test_models.py
Normal file
@ -0,0 +1,39 @@
import numpy as np
import xgboost as xgb

dpath = 'demo/data/'
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')

def test_glm():
    param = {'silent': 1, 'objective': 'binary:logistic', 'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1}
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    num_round = 4
    bst = xgb.train(param, dtrain, num_round, watchlist)
    assert isinstance(bst, xgb.core.Booster)
    preds = bst.predict(dtest)
    labels = dtest.get_label()
    err = sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
    assert err < 0.1

def test_custom_objective():
    param = {'max_depth': 2, 'eta': 1, 'silent': 1}
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    num_round = 2
    def logregobj(preds, dtrain):
        labels = dtrain.get_label()
        preds = 1.0 / (1.0 + np.exp(-preds))
        grad = preds - labels
        hess = preds * (1.0 - preds)
        return grad, hess
    def evalerror(preds, dtrain):
        labels = dtrain.get_label()
        return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
    bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
    assert isinstance(bst, xgb.core.Booster)
    preds = bst.predict(dtest)
    labels = dtest.get_label()
    err = sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
    assert err < 0.1
@ -435,6 +435,7 @@ int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
  *out_dptr = BeginPtr(vec);
  API_END();
}

int XGDMatrixNumRow(const DMatrixHandle handle,
                    bst_ulong *out) {
  API_BEGIN();
@ -442,6 +443,13 @@ int XGDMatrixNumRow(const DMatrixHandle handle,
  API_END();
}

int XGDMatrixNumCol(const DMatrixHandle handle,
                    bst_ulong *out) {
  API_BEGIN();
  *out = static_cast<size_t>(static_cast<const DataMatrix*>(handle)->info.num_col());
  API_END();
}

// xgboost implementation
int XGBoosterCreate(DMatrixHandle dmats[],
                    bst_ulong len,
@ -572,3 +580,20 @@ int XGBoosterDumpModel(BoosterHandle handle,
      featmap, with_stats != 0, len);
  API_END();
}

int XGBoosterDumpModelWithFeatures(BoosterHandle handle,
                                   int fnum,
                                   const char **fname,
                                   const char **ftype,
                                   int with_stats,
                                   bst_ulong *len,
                                   const char ***out_models) {
  API_BEGIN();
  utils::FeatMap featmap;
  for (int i = 0; i < fnum; ++i) {
    featmap.PushBack(i, fname[i], ftype[i]);
  }
  *out_models = static_cast<Booster*>(handle)->GetModelDump(
      featmap, with_stats != 0, len);
  API_END();
}
@ -184,6 +184,13 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle,
 */
XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle,
                            bst_ulong *out);
/*!
 * \brief get number of columns
 * \param handle the handle to the DMatrix
 * \return 0 when success, -1 when failure happens
 */
XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle,
                            bst_ulong *out);
// --- start XGBoost class
/*!
 * \brief create xgboost learner
@ -324,4 +331,24 @@ XGB_DLL int XGBoosterDumpModel(BoosterHandle handle,
                               int with_stats,
                               bst_ulong *out_len,
                               const char ***out_dump_array);

/*!
 * \brief dump model, return array of strings representing model dump
 * \param handle handle
 * \param fnum number of features
 * \param fname names of features
 * \param ftype types of features
 * \param with_stats whether to dump with statistics
 * \param len length of output array
 * \param out_models pointer to hold the dump of each model
 * \return 0 when success, -1 when failure happens
 */
XGB_DLL int XGBoosterDumpModelWithFeatures(BoosterHandle handle,
                                           int fnum,
                                           const char **fname,
                                           const char **ftype,
                                           int with_stats,
                                           bst_ulong *len,
                                           const char ***out_models);

#endif  // XGBOOST_WRAPPER_H_