Use black on more Python files. (#8137)
This commit is contained in:
@@ -19,13 +19,14 @@ Also, see the tutorial for using XGBoost with categorical data:
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from time import time
|
||||
|
||||
import os
|
||||
from tempfile import TemporaryDirectory
|
||||
from time import time
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import roc_auc_score
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
import xgboost as xgb
|
||||
|
||||
|
||||
@@ -16,11 +16,13 @@ categorical data.
|
||||
.. versionadded:: 1.5.0
|
||||
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
from typing import Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import xgboost as xgb
|
||||
|
||||
|
||||
def make_categorical(
|
||||
n_samples: int, n_features: int, n_categories: int, onehot: bool
|
||||
|
||||
@@ -1,35 +1,34 @@
|
||||
'''
|
||||
"""
|
||||
Collection of examples for using xgboost.spark estimator interface
|
||||
==================================================================
|
||||
|
||||
@author: Weichen Xu
|
||||
'''
|
||||
"""
|
||||
import sklearn.datasets
|
||||
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
|
||||
from pyspark.ml.linalg import Vectors
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql.functions import rand
|
||||
from pyspark.ml.linalg import Vectors
|
||||
import sklearn.datasets
|
||||
from sklearn.model_selection import train_test_split
|
||||
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
|
||||
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator
|
||||
|
||||
|
||||
spark = SparkSession.builder.master("local[*]").getOrCreate()
|
||||
|
||||
|
||||
def create_spark_df(X, y):
|
||||
return spark.createDataFrame(
|
||||
spark.sparkContext.parallelize([
|
||||
(Vectors.dense(features), float(label))
|
||||
for features, label in zip(X, y)
|
||||
]),
|
||||
["features", "label"]
|
||||
spark.sparkContext.parallelize(
|
||||
[(Vectors.dense(features), float(label)) for features, label in zip(X, y)]
|
||||
),
|
||||
["features", "label"],
|
||||
)
|
||||
|
||||
|
||||
# load diabetes dataset (regression dataset)
|
||||
diabetes_X, diabetes_y = sklearn.datasets.load_diabetes(return_X_y=True)
|
||||
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = \
|
||||
train_test_split(diabetes_X, diabetes_y, test_size=0.3, shuffle=True)
|
||||
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(
|
||||
diabetes_X, diabetes_y, test_size=0.3, shuffle=True
|
||||
)
|
||||
|
||||
diabetes_train_spark_df = create_spark_df(diabetes_X_train, diabetes_y_train)
|
||||
diabetes_test_spark_df = create_spark_df(diabetes_X_test, diabetes_y_test)
|
||||
@@ -38,25 +37,36 @@ diabetes_test_spark_df = create_spark_df(diabetes_X_test, diabetes_y_test)
|
||||
xgb_regressor = SparkXGBRegressor(max_depth=5)
|
||||
xgb_regressor_model = xgb_regressor.fit(diabetes_train_spark_df)
|
||||
|
||||
transformed_diabetes_test_spark_df = xgb_regressor_model.transform(diabetes_test_spark_df)
|
||||
transformed_diabetes_test_spark_df = xgb_regressor_model.transform(
|
||||
diabetes_test_spark_df
|
||||
)
|
||||
regressor_evaluator = RegressionEvaluator(metricName="rmse")
|
||||
print(f"regressor rmse={regressor_evaluator.evaluate(transformed_diabetes_test_spark_df)}")
|
||||
print(
|
||||
f"regressor rmse={regressor_evaluator.evaluate(transformed_diabetes_test_spark_df)}"
|
||||
)
|
||||
|
||||
diabetes_train_spark_df2 = diabetes_train_spark_df.withColumn(
|
||||
"validationIndicatorCol", rand(1) > 0.7
|
||||
)
|
||||
|
||||
# train xgboost regressor model with validation dataset
|
||||
xgb_regressor2 = SparkXGBRegressor(max_depth=5, validation_indicator_col="validationIndicatorCol")
|
||||
xgb_regressor2 = SparkXGBRegressor(
|
||||
max_depth=5, validation_indicator_col="validationIndicatorCol"
|
||||
)
|
||||
xgb_regressor_model2 = xgb_regressor2.fit(diabetes_train_spark_df2)
|
||||
transformed_diabetes_test_spark_df2 = xgb_regressor_model2.transform(diabetes_test_spark_df)
|
||||
print(f"regressor2 rmse={regressor_evaluator.evaluate(transformed_diabetes_test_spark_df2)}")
|
||||
transformed_diabetes_test_spark_df2 = xgb_regressor_model2.transform(
|
||||
diabetes_test_spark_df
|
||||
)
|
||||
print(
|
||||
f"regressor2 rmse={regressor_evaluator.evaluate(transformed_diabetes_test_spark_df2)}"
|
||||
)
|
||||
|
||||
|
||||
# load iris dataset (classification dataset)
|
||||
iris_X, iris_y = sklearn.datasets.load_iris(return_X_y=True)
|
||||
iris_X_train, iris_X_test, iris_y_train, iris_y_test = \
|
||||
train_test_split(iris_X, iris_y, test_size=0.3, shuffle=True)
|
||||
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
|
||||
iris_X, iris_y, test_size=0.3, shuffle=True
|
||||
)
|
||||
|
||||
iris_train_spark_df = create_spark_df(iris_X_train, iris_y_train)
|
||||
iris_test_spark_df = create_spark_df(iris_X_test, iris_y_test)
|
||||
@@ -74,9 +84,13 @@ iris_train_spark_df2 = iris_train_spark_df.withColumn(
|
||||
)
|
||||
|
||||
# train xgboost classifier model with validation dataset
|
||||
xgb_classifier2 = SparkXGBClassifier(max_depth=5, validation_indicator_col="validationIndicatorCol")
|
||||
xgb_classifier2 = SparkXGBClassifier(
|
||||
max_depth=5, validation_indicator_col="validationIndicatorCol"
|
||||
)
|
||||
xgb_classifier_model2 = xgb_classifier2.fit(iris_train_spark_df2)
|
||||
transformed_iris_test_spark_df2 = xgb_classifier_model2.transform(iris_test_spark_df)
|
||||
print(f"classifier2 f1={classifier_evaluator.evaluate(transformed_iris_test_spark_df2)}")
|
||||
print(
|
||||
f"classifier2 f1={classifier_evaluator.evaluate(transformed_iris_test_spark_df2)}"
|
||||
)
|
||||
|
||||
spark.stop()
|
||||
|
||||
Reference in New Issue
Block a user