import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# One seed shared by the train/test split and the model so that the reported
# accuracy (and any later search that reuses this estimator) is reproducible.
RANDOM_SEED = 121

# Load scikit-learn's bundled breast cancer dataset.
cancer_dataset = load_breast_cancer()

# DataFrame view of the features, labelled with the dataset's feature names.
cancer_dataset_df = pd.DataFrame(cancer_dataset.data, columns=cancer_dataset.feature_names)
cancer_dataset_df.head()  # notebook preview; prints nothing when run as a plain script

# Hold out 20% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    cancer_dataset.data,
    cancer_dataset.target,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# Baseline GBM with default hyperparameters. random_state is fixed because the
# trees permute the candidate features at every split, so an unseeded model can
# give slightly different scores from run to run.
boosting_gbm_ensemble = GradientBoostingClassifier(random_state=RANDOM_SEED)

boosting_gbm_ensemble.fit(X_train, y_train)
y_pred = boosting_gbm_ensemble.predict(X_test)

print("boosting GBM 분류기 정확도 {0:.4f}".format(accuracy_score(y_test, y_pred)))

boosting GBM 분류기 정확도 0.9825

# GBM has too many hyperparameters to tune by hand one at a time, so let
# GridSearchCV evaluate every combination below and keep the best one.
params = {
    'n_estimators': [100, 300, 500],  # number of boosting stages (trees fitted one after another)
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20],
    'learning_rate': [0.05, 0.1],  # shrinkage applied as each weak learner corrects the previous error
}

# 3 * 4 * 3 * 3 * 2 = 216 candidates, each fitted on 2 folds = 432 fits.
# n_jobs=-1 runs those fits on all available cores instead of sequentially;
# the selected parameters and scores are unaffected by the degree of parallelism.
grid_cv = GridSearchCV(
    boosting_gbm_ensemble,
    param_grid=params,
    cv=2,
    verbose=1,
    n_jobs=-1,
)
grid_cv.fit(X_train, y_train)

Fitting 2 folds for each of 216 candidates, totalling 432 fits

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:  6.9min finished

GridSearchCV(cv=2, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.05, 0.1],
                         'max_depth': [6, 8, 10, 12],
                         'min_samples_leaf': [8, 12, 18],
                         'min_samples_split': [8, 16, 20],
                         'n_estimators': [100, 300, 500]},
             verbose=1)

# Report the winning parameter combination and its mean cross-validated score.
best_params = grid_cv.best_params_
best_cv_score = grid_cv.best_score_
print("최적의 하이퍼 파라미터: ", best_params)
print("최고의 예측 정확도: ", best_cv_score)

최적의 하이퍼 파라미터:  {'learning_rate': 0.1, 'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 100}
최고의 예측 정확도:  0.9604393693484814

# The grid search has identified the best hyperparameters. Use the estimator it
# trained with them (best_estimator_) to predict on the held-out test set.
y_pred = grid_cv.best_estimator_.predict(X_test)
# accuracy_score expects (y_true, y_pred); keep the same order as the baseline evaluation.
print("boosting GBM 정확도: {0:.4f}".format(accuracy_score(y_test, y_pred)))

boosting GBM 정확도: 1.0000

#feature 중요도도 그려볼 수 있다. 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Pair each importance score of the tuned model with its feature name.
tuned_gbm = grid_cv.best_estimator_
feature_importances_values = tuned_gbm.feature_importances_
feature_importances = pd.Series(
    feature_importances_values,
    index=cancer_dataset_df.columns,
)

# Rank from most to least important and keep the first 20 for the chart.
ranked_importances = feature_importances.sort_values(ascending=False)
feature_importances_top20 = ranked_importances.head(20)

# Horizontal bar chart: importance on the x axis, feature name on the y axis.
plt.figure(figsize=(8, 6))
plt.title('Feature Importances Top 20')
sns.barplot(
    x=feature_importances_top20,
    y=feature_importances_top20.index,
)
plt.show()

	mean radius	mean texture	mean perimeter	mean area	mean smoothness	mean compactness	mean concavity	mean concave points	mean symmetry	mean fractal dimension	...	worst radius	worst texture	worst perimeter	worst area	worst smoothness	worst compactness	worst concavity	worst concave points	worst symmetry	worst fractal dimension
0	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.3001	0.14710	0.2419	0.07871	...	25.38	17.33	184.60	2019.0	0.1622	0.6656	0.7119	0.2654	0.4601	0.11890
1	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.0869	0.07017	0.1812	0.05667	...	24.99	23.41	158.80	1956.0	0.1238	0.1866	0.2416	0.1860	0.2750	0.08902
2	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.1974	0.12790	0.2069	0.05999	...	23.57	25.53	152.50	1709.0	0.1444	0.4245	0.4504	0.2430	0.3613	0.08758
3	11.42	20.38	77.58	386.1	0.14250	0.28390	0.2414	0.10520	0.2597	0.09744	...	14.91	26.50	98.87	567.7	0.2098	0.8663	0.6869	0.2575	0.6638	0.17300
4	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.1980	0.10430	0.1809	0.05883	...	22.54	16.67	152.20	1575.0	0.1374	0.2050	0.4000	0.1625	0.2364	0.07678

Classification 4. 앙상블 학습(Ensemble Learning) - Boosting(3. XGBoost) (0)	2020.10.10
Classification 2. 앙상블 학습(Ensemble Learning) - Voting과 Bagging (0)	2020.10.09
Classification 1. 결정 트리(Decision Tree) (0)	2020.10.08
Regression2. 경사하강법 수행 프로세스와 python code (0)	2020.10.01
Evaluation2. 회귀의 성능 평가 지표(MAE, MSE, RMSE, R제곱) (0)	2020.09.28

Grace's Tech Blog

Classification 3. 앙상블 학습(Ensemble Learning) - Boosting (1. AdaBoost, 2. GBM)

3) Boosting

1. AdaBoost

2. GBM(Gradient Boost Machine)

Reference

'Data Science > Machine Learning' 카테고리의 다른 글

'Data Science/Machine Learning'의 다른글

티스토리툴바

Classification 3. 앙상블 학습(Ensemble Learning) - Boosting (1. AdaBoost, 2. GBM)

3) Boosting

1. AdaBoost

2. GBM(Gradient Boost Machine)

Reference

'Data Science > Machine Learning' 카테고리의 다른 글

'Data Science/Machine Learning'의 다른글

관련글

티스토리툴바