import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the breast cancer dataset bundled with scikit-learn.
cancer_dataset = load_breast_cancer()

# Printing the raw dataset object is hard to read, so wrap the feature matrix
# in a DataFrame with named columns for inspection.
cancer_dataset_df = pd.DataFrame(cancer_dataset.data, columns=cancer_dataset.feature_names)
cancer_dataset_df.head()

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cancer_dataset.data, cancer_dataset.target, test_size=0.2, random_state=121
)

# Base learners: logistic regression and k-nearest neighbours.
# With the default iteration cap the lbfgs solver stops before converging on
# these unscaled features and emits a ConvergenceWarning, so raise max_iter.
# (Scaling the features is the other remedy that warning suggests.)
logistic_regression = LogisticRegression(max_iter=10000)
KNN = KNeighborsClassifier()

# Voting ensemble over the two base learners, configured for soft voting.
voting_ensemble = VotingClassifier(
    estimators=[("LogisticRegression", logistic_regression), ("KNN", KNN)],
    voting="soft",
)

# Fit the ensemble on the training split and score it on the held-out split.
voting_ensemble.fit(X_train, y_train)
y_pred = voting_ensemble.predict(X_test)

print("voting 분류기 정확도 {0:.4f}".format(accuracy_score(y_test, y_pred)))

voting 분류기 정확도 0.9649

/Users/jeonghyeonjeong/opt/anaconda3/envs/pythonML/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Load the breast cancer dataset and keep a DataFrame view with named columns.
cancer_dataset = load_breast_cancer()

cancer_dataset_df = pd.DataFrame(cancer_dataset.data, columns=cancer_dataset.feature_names)
cancer_dataset_df.head()

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cancer_dataset.data, cancer_dataset.target, test_size=0.2, random_state=121
)

# Baseline random forest with default hyperparameters.
# Seed the forest so the reported accuracy is reproducible from run to run,
# matching the random_state=0 used by the tuned forests further down.
bagging_rf_ensemble = RandomForestClassifier(random_state=0)

bagging_rf_ensemble.fit(X_train, y_train)
y_pred = bagging_rf_ensemble.predict(X_test)

print("bagging RandomForest 분류기 정확도 {0:.4f}".format(accuracy_score(y_test, y_pred)))

bagging RandomForest 분류기 정확도 0.9825

# Random forest hyperparameter tuning: there are too many hyperparameters to
# try by hand one at a time, so search a grid with GridSearchCV first and then
# reuse the best combination it reports.
params = dict(
    n_estimators=[100],  # 100 sub-datasets / decision trees
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# n_jobs=-1 lets training use every available CPU core.
bagging_rf_ensemble = RandomForestClassifier(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(
    estimator=bagging_rf_ensemble,
    param_grid=params,
    n_jobs=-1,
    cv=2,
)
grid_cv.fit(X_train, y_train)

GridSearchCV(cv=2, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
             n_jobs=-1,
             param_grid={'max_depth': [6, 8, 10, 12],
                         'min_samples_leaf': [8, 12, 18],
                         'min_samples_split': [8, 16, 20],
                         'n_estimators': [100]})

# Report the winning hyperparameter combination and its cross-validated score.
for label, attr in (
    ("최적의 하이퍼 파라미터: ", "best_params_"),
    ("최고의 예측 정확도: ", "best_score_"),
):
    print(label, getattr(grid_cv, attr))

최적의 하이퍼 파라미터:  {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}
최고의 예측 정확도:  0.9472621531803076

# The grid search above yields the best hyperparameter combination.
# With those known, adjust only the parameters that intuitively look
# beneficial, keep the rest as found, and train a RandomForestClassifier
# once more with the resulting settings.

final_bagging_rf_ensemble = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
    max_depth=6,
    min_samples_split=8,
    min_samples_leaf=8,
)

final_bagging_rf_ensemble.fit(X_train, y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, min_samples_split=8,
                       random_state=0)

# Score the retrained forest on the held-out test split.
y_pred = final_bagging_rf_ensemble.predict(X_test)
print(f"예측 정확도:{accuracy_score(y_test, y_pred):.4f} ")

예측 정확도:0.9737

#feature 중요도도 그려볼 수 있다. 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

feature_importances_values = final_bagging_rf_ensemble.feature_importances_
feature_importances = pd.Series(feature_importances_values, index=cancer_dataset_df.columns)
feature_importances_top20 = feature_importances.sort_values(ascending=False)[:20] #중요한 순서로 정렬 후 top 20개만 뽑아 시각화

plt.figure(figsize=(8, 6))
plt.title('Feature Importances Top 20')
sns.barplot(x=feature_importances_top20, y=feature_importances_top20.index)
plt.show()

	mean radius	mean texture	mean perimeter	mean area	mean smoothness	mean compactness	mean concavity	mean concave points	mean symmetry	mean fractal dimension	...	worst radius	worst texture	worst perimeter	worst area	worst smoothness	worst compactness	worst concavity	worst concave points	worst symmetry	worst fractal dimension
0	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.3001	0.14710	0.2419	0.07871	...	25.38	17.33	184.60	2019.0	0.1622	0.6656	0.7119	0.2654	0.4601	0.11890
1	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.0869	0.07017	0.1812	0.05667	...	24.99	23.41	158.80	1956.0	0.1238	0.1866	0.2416	0.1860	0.2750	0.08902
2	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.1974	0.12790	0.2069	0.05999	...	23.57	25.53	152.50	1709.0	0.1444	0.4245	0.4504	0.2430	0.3613	0.08758
3	11.42	20.38	77.58	386.1	0.14250	0.28390	0.2414	0.10520	0.2597	0.09744	...	14.91	26.50	98.87	567.7	0.2098	0.8663	0.6869	0.2575	0.6638	0.17300
4	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.1980	0.10430	0.1809	0.05883	...	22.54	16.67	152.20	1575.0	0.1374	0.2050	0.4000	0.1625	0.2364	0.07678

	mean radius	mean texture	mean perimeter	mean area	mean smoothness	mean compactness	mean concavity	mean concave points	mean symmetry	mean fractal dimension	...	worst radius	worst texture	worst perimeter	worst area	worst smoothness	worst compactness	worst concavity	worst concave points	worst symmetry	worst fractal dimension
0	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.3001	0.14710	0.2419	0.07871	...	25.38	17.33	184.60	2019.0	0.1622	0.6656	0.7119	0.2654	0.4601	0.11890
1	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.0869	0.07017	0.1812	0.05667	...	24.99	23.41	158.80	1956.0	0.1238	0.1866	0.2416	0.1860	0.2750	0.08902
2	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.1974	0.12790	0.2069	0.05999	...	23.57	25.53	152.50	1709.0	0.1444	0.4245	0.4504	0.2430	0.3613	0.08758
3	11.42	20.38	77.58	386.1	0.14250	0.28390	0.2414	0.10520	0.2597	0.09744	...	14.91	26.50	98.87	567.7	0.2098	0.8663	0.6869	0.2575	0.6638	0.17300
4	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.1980	0.10430	0.1809	0.05883	...	22.54	16.67	152.20	1575.0	0.1374	0.2050	0.4000	0.1625	0.2364	0.07678

Classification 4. 앙상블 학습(Ensemble Learning) - Boosting(3. XGBoost) (0)	2020.10.10
Classification 3. 앙상블 학습(Ensemble Learning) - Boosting (1. AdaBoost, 2. GBM) (0)	2020.10.10
Classification 1. 결정 트리(Decision Tree) (0)	2020.10.08
Regression2. 경사하강법 수행 프로세스와 python code (0)	2020.10.01
Evaluation2. 회귀의 성능 평가 지표(MAE, MSE, RMSE, R제곱) (0)	2020.09.28

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

Grace's Tech Blog

Classification 2. 앙상블 학습(Ensemble Learning) - Voting과 Bagging

앙상블 학습이란,

앙상블의 특징

앙상블의 유형

1) Voting

2) Bagging

'Data Science > Machine Learning' 카테고리의 다른 글

'Data Science/Machine Learning'의 다른글

티스토리툴바

개인정보

단축키

내 블로그

블로그 게시글

모든 영역

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

Classification 2. 앙상블 학습(Ensemble Learning) - Voting과 Bagging

앙상블 학습이란,

앙상블의 특징

앙상블의 유형

1) Voting

2) Bagging

'Data Science > Machine Learning' 카테고리의 다른 글

'Data Science/Machine Learning'의 다른글

관련글

티스토리툴바

개인정보

단축키

내 블로그

블로그 게시글

모든 영역