import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_theme(style="whitegrid", palette="muted")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc

import tensorflow as tf
from tensorflow import keras

# Install the Nanum Korean fonts with apt through an IPython shell escape.
# NOTE(review): apt-get only exists on Debian/Ubuntu-style hosts; the output
# recorded below this cell shows the command is not available on this machine.
!apt-get -qq install fonts-nanum

'apt-get' is not recognized as an internal or external command,
operable program or batch file.

from matplotlib import rc
from matplotlib import font_manager

# A Korean-capable font has to be selected, otherwise the Hangul used in the
# plot titles and labels cannot be drawn.  NanumGothic (the original choice)
# is preferred; when it is not installed, fall back to the Korean fonts that
# ship with Windows ('Malgun Gothic') or macOS ('AppleGothic').
_installed_fonts = {font.name for font in font_manager.fontManager.ttflist}
_korean_font = next(
    (name for name in ('NanumGothic', 'Malgun Gothic', 'AppleGothic')
     if name in _installed_fonts),
    'NanumGothic',  # keep the original setting when no candidate is found
)
rc('font', family=_korean_font)
# Draw minus signs with the ASCII hyphen: many Korean fonts lack the Unicode
# minus glyph, which would otherwise show up as an empty box on the axes.
rc('axes', unicode_minus=False)

# Load the dataset from CSV (the original note calls it SPSS data) and
# preview the first five rows.
hn19 = pd.read_csv(filepath_or_buffer="HN19_lec3.csv")
hn19.head(5)

# Drop rows whose source measurements are missing BEFORE recoding them.
# np.where(<series>.isin([2, 3]), 1, 0) maps NaN to 0, so recoding first would
# silently turn missing blood-pressure / HbA1c codes into "negative" cases and
# a dropna on the derived hp / diabetes columns would never remove anything.
hn19 = hn19.dropna(subset=['HE_BMI', 'HE_HP', 'HE_DM_HbA1c'])
hn19 = hn19.assign(
    bmi=hn19['HE_BMI'],
    # 1 when the source code is 2 or 3, otherwise 0
    hp=np.where(hn19['HE_HP'].isin([2, 3]), 1, 0),
    diabetes=np.where(hn19['HE_DM_HbA1c'].isin([2, 3]), 1, 0),
)[['age', 'bmi', 'hp', 'diabetes']]  # keep only the analysis columns
hn19.tail(3)

# Pairwise correlations between the four analysis columns.
corr = hn19.loc[:, ['age', 'bmi', 'hp', 'diabetes']].corr()

# Annotated heatmap with the colour scale centred on zero.
sns.heatmap(data=corr, center=0, cmap='coolwarm', annot=True)
plt.title("변수 간 상관계수 Heatmap")
plt.show()

# Spread the 0/1 outcome vertically with small uniform noise so that
# overlapping points stay distinguishable in the scatter plot.
y_jitter = hn19['diabetes'] + np.random.uniform(low=-0.05, high=0.05, size=hn19.shape[0])

sns.scatterplot(data=hn19, x='bmi', y=y_jitter, alpha=0.4)
plt.title("BMI vs diabetes")
plt.show()

# Logistic regression of diabetes on the two predictors age and bmi.
model_logistic_2d = LogisticRegression(max_iter=1000)
model_logistic_2d.fit(X=hn19.loc[:, ['age', 'bmi']], y=hn19['diabetes'])

LogisticRegression(max_iter=1000)

LogisticRegression(max_iter=1000)

# Build a 100 x 100 grid spanning the observed age and BMI ranges
age_seq = np.linspace(hn19['age'].min(), hn19['age'].max(), num=100)
bmi_seq = np.linspace(hn19['bmi'].min(), hn19['bmi'].max(), num=100)
A, B = np.meshgrid(age_seq, bmi_seq)

# One row per grid point, using the column names the model was fitted with
grid = pd.DataFrame({'age': A.reshape(-1), 'bmi': B.reshape(-1)})
# Second predict_proba column at every grid point, folded back to the grid shape
pred_logistic_2d = model_logistic_2d.predict_proba(grid)[:, 1].reshape(A.shape)

# Filled contour of the predicted probability over the age/BMI plane
contour = plt.contourf(A, B, pred_logistic_2d, levels=50, cmap='RdBu', alpha=0.7)
plt.colorbar(contour, label="P(당뇨병=1)")
plt.title("로지스틱 회귀의 당뇨병 예측 확률")
plt.xlabel("나이(Age)")
plt.ylabel("BMI")
plt.show()

# Logistic regression with all three predictors; print the fitted
# coefficients and intercept.
model_logistic = LogisticRegression(max_iter=1000)
model_logistic.fit(X=hn19.loc[:, ['age', 'bmi', 'hp']], y=hn19['diabetes'])
print("회귀계수:", model_logistic.coef_, "절편:", model_logistic.intercept_)

회귀계수: [[0.05787047 0.15102873 0.34019309]] 절편: [-6.74213772]

# 1. Depth-2 decision tree on age and bmi, drawn as a tree diagram
model_tree_2d = DecisionTreeClassifier(random_state=2025, max_depth=2)
model_tree_2d.fit(X=hn19.loc[:, ['age', 'bmi']], y=hn19['diabetes'])

plt.figure(figsize=(10, 5))
plot_tree(model_tree_2d, filled=True, class_names=['0','1'], feature_names=['age','bmi'])
plt.show()

# 2. Predicted probabilities of the tree over the existing age/BMI grid
pred_tree_2d = model_tree_2d.predict_proba(grid)[:, 1].reshape(A.shape)

# 3. Visualization: filled contour of the per-region probability
plt.figure(figsize=(8, 6))
contour = plt.contourf(A, B, pred_tree_2d, levels=50, cmap='RdBu', alpha=0.7)
plt.colorbar(contour, label="P(당뇨병=1)")
plt.title("결정 나무의 영역별 당뇨병 예측 확률")
plt.xlabel("나이(Age)")
plt.ylabel("BMI")
plt.show()

# The amount of pruning is set through alpha (= cp); larger values prune more.
# NOTE(review): the tree is fitted on the first 100 rows only - confirm intended.
model_ptree = DecisionTreeClassifier(ccp_alpha=0.01, random_state=42)
model_ptree.fit(
    hn19[['age','bmi', 'hp']].iloc[:100],
    hn19['diabetes'].iloc[:100],
)

# Visualization
plt.figure(figsize=(10, 6))
plot_tree(model_ptree, filled=True, class_names=['0','1'], feature_names=['age','bmi','hp'])
plt.title("Pruned Decision Tree (ccp_alpha=0.01)")
plt.show()

# Bagging expressed as a random forest whose max_features equals the number
# of predictor columns passed to fit (3).
model_bag = RandomForestClassifier(random_state=42, max_features=3, n_estimators=100)
model_bag.fit(X=hn19.loc[:, ['age', 'bmi', 'hp']], y=hn19['diabetes'])

RandomForestClassifier(max_features=3, random_state=42)

RandomForestClassifier(max_features=3, random_state=42)

# Random forest: max_features=2 while three predictor columns are supplied.
model_rf = RandomForestClassifier(random_state=2025, max_features=2, n_estimators=100)
model_rf.fit(X=hn19.loc[:, ['age', 'bmi', 'hp']], y=hn19['diabetes'])

RandomForestClassifier(max_features=2, random_state=2025)

RandomForestClassifier(max_features=2, random_state=2025)

# Training arrays for the network.
# NOTE(review): only the first 100 rows are used - confirm this is intended.
X_nn = hn19[['age','bmi','hp']].iloc[0:100].values
y_nn = hn19['diabetes'].iloc[0:100].values

# Small feed-forward network: 3 inputs -> 2 ReLU units -> 1 sigmoid output.
# The input shape is declared with an explicit keras.Input layer; passing
# `input_shape` to Dense is deprecated and triggers a UserWarning.
model_nn = keras.Sequential([
    keras.Input(shape=(3,)),
    keras.layers.Dense(2, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_nn.fit(X_nn, y_nn, epochs=50, verbose=0)
# evaluate() returns [loss, accuracy]; the accuracy is measured on the same
# rows the model was trained on.
print("신경망 정확도:", model_nn.evaluate(X_nn, y_nn, verbose=0)[1])

C:\Users\rupik\anaconda3\Lib\site-packages\keras\src\layers\core\dense.py:92: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)

신경망 정확도: 0.49000000953674316

# Split the rows into train/test parts; test_size=0.9 assigns 90% of the
# rows to the test part.
train, test = train_test_split(hn19, random_state=42, test_size=0.9)

# Predictors and outcome for each part
X_train = train[['age','bmi','hp']]
y_train = train['diabetes']
X_test = test[['age','bmi','hp']]
y_test = test['diabetes']

def calc_metrics(thresholds, actual, predicted_prob):
    """Compute accuracy, sensitivity and specificity at each threshold.

    Parameters
    ----------
    thresholds : iterable of float
        Cut-offs to evaluate; an observation is classified as positive when
        its predicted probability is strictly greater than the cut-off.
    actual : array-like
        True labels coded as 0 (negative) / 1 (positive).  Lists, NumPy
        arrays and pandas Series are accepted.
    predicted_prob : array-like
        Predicted scores, one per element of ``actual``.  An ``(n, 1)``
        column is flattened to ``(n,)``.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with the columns ``Threshold``, ``Accuracy``,
        ``Sensitivity`` and ``Specificity``.  A metric whose denominator is
        zero is reported as 0.

    Raises
    ------
    ValueError
        If ``actual`` and ``predicted_prob`` differ in length.
    """
    # Work on flat NumPy arrays so that the comparison is positional: plain
    # lists are supported and pandas index alignment cannot reorder values.
    actual = np.asarray(actual).ravel()
    predicted_prob = np.asarray(predicted_prob).ravel()
    if actual.shape != predicted_prob.shape:
        raise ValueError(
            "actual and predicted_prob must have the same length, got "
            f"{actual.shape[0]} and {predicted_prob.shape[0]}"
        )

    # The label masks do not depend on the threshold, so build them once.
    is_pos = actual == 1
    is_neg = actual == 0

    results = []
    for t in thresholds:
        # Positive when the predicted probability exceeds the threshold
        pred_pos = predicted_prob > t

        # Confusion-matrix cells
        TP = int(np.sum(pred_pos & is_pos))
        TN = int(np.sum(~pred_pos & is_neg))
        FP = int(np.sum(pred_pos & is_neg))
        FN = int(np.sum(~pred_pos & is_pos))

        # Metrics, each guarded against an empty denominator
        total = TP + TN + FP + FN
        accuracy = (TP + TN) / total if total > 0 else 0.0
        sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0.0
        specificity = TN / (TN + FP) if (TN + FP) > 0 else 0.0

        results.append((t, accuracy, sensitivity, specificity))
    return pd.DataFrame(results, columns=["Threshold","Accuracy","Sensitivity","Specificity"])

# Candidate cut-offs 0.00, 0.01, ..., 1.00
thresholds = np.arange(0, 1.01, 0.01)
# Position of the 0.5 cut-off.  It is located by nearest value because an
# exact `==` comparison against np.arange floats is not reliable in general.
id_05 = int(np.argmin(np.abs(thresholds - 0.5)))

# Fit on the training part and score the test part
model_logistic = LogisticRegression(max_iter=1000)
model_logistic.fit(X_train, y_train)
pred_logistic = model_logistic.predict_proba(X_test)[:,1]

metrics_logistic = calc_metrics(thresholds, y_test.values, pred_logistic)
# Accuracy at threshold = 0.5
acc_05 = metrics_logistic["Accuracy"].iloc[id_05]
print("Logistic Accuracy: ", acc_05)

Logistic Accuracy:  0.7511227002752426

# Reshape to long form (one row per threshold/metric pair) for seaborn
metrics_long = metrics_logistic.melt(
    id_vars="Threshold",
    value_vars=["Accuracy","Sensitivity","Specificity"],
    var_name="Metric",
    value_name="Value",
)

# One line per metric across the thresholds
plt.figure(figsize=(8, 6))
sns.lineplot(x="Threshold", y="Value", hue="Metric", data=metrics_long, linewidth=1.5)
plt.title("Accuracy / Sensitivity / Specificity")
plt.xlabel("Threshold")
plt.ylabel("Metric Value")
plt.legend(title="Metric")
plt.show()

# ROC curve of the logistic model on the test part and the area under it
fpr, tpr, _ = roc_curve(y_test, pred_logistic)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
# Dashed diagonal as a visual reference
plt.plot([0, 1], [0, 1], 'k--')
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()

# Logistic regression of diabetes on bmi alone.
model_logistic_bmi = LogisticRegression(max_iter=1000)
model_logistic_bmi.fit(X=hn19.loc[:, ['bmi']], y=hn19['diabetes'])

LogisticRegression(max_iter=1000)

LogisticRegression(max_iter=1000)

# 100 evenly spaced BMI values across the observed range
grid_bmi = pd.DataFrame(
    {'bmi': np.linspace(hn19['bmi'].min(), hn19['bmi'].max(), num=100)}
)
# NOTE(review): this rebinds pred_logistic, which earlier in the file holds
# the test-set probabilities of the three-variable model.
pred_logistic = model_logistic_bmi.predict_proba(grid_bmi)[:, 1]

# Jittered 0/1 outcome so overlapping points remain visible
y_jitter = hn19['diabetes'] + np.random.uniform(low=-0.05, high=0.05, size=hn19.shape[0])

# Observations with the fitted probability curve drawn on top
sns.scatterplot(data=hn19, x='bmi', y=y_jitter, alpha=0.4)
sns.lineplot(x=grid_bmi['bmi'], y=pred_logistic)
plt.title("BMI vs diabetes")
plt.show()

# Depth-2 decision tree of diabetes on bmi alone.
model_tree_bmi = DecisionTreeClassifier(random_state=2025, max_depth=2)
model_tree_bmi.fit(X=hn19.loc[:, ['bmi']], y=hn19['diabetes'])

DecisionTreeClassifier(max_depth=2, random_state=2025)

DecisionTreeClassifier(max_depth=2, random_state=2025)

# Tree probabilities along the BMI grid
pred_tree = model_tree_bmi.predict_proba(grid_bmi)[:, 1]

# Jittered 0/1 outcome so overlapping points remain visible
y_jitter = hn19['diabetes'] + np.random.uniform(low=-0.05, high=0.05, size=hn19.shape[0])

# Observations with the tree's predicted probability drawn on top
sns.scatterplot(data=hn19, x='bmi', y=y_jitter, alpha=0.4)
sns.lineplot(x=grid_bmi['bmi'], y=pred_tree)
plt.title("BMI vs diabetes")
plt.show()

# Bagging on age and bmi: max_features equals the number of predictors (2).
model_bag_2d = RandomForestClassifier(random_state=2025, max_features=2, n_estimators=100)
model_bag_2d.fit(X=hn19.loc[:, ['age', 'bmi']], y=hn19['diabetes'])

RandomForestClassifier(max_features=2, random_state=2025)

RandomForestClassifier(max_features=2, random_state=2025)

# Build a 100 x 100 grid spanning the observed age and BMI ranges
age_seq = np.linspace(hn19['age'].min(), hn19['age'].max(), num=100)
bmi_seq = np.linspace(hn19['bmi'].min(), hn19['bmi'].max(), num=100)
A, B = np.meshgrid(age_seq, bmi_seq)

# One row per grid point, then the bagging model's probability for each
grid = pd.DataFrame({'age': A.reshape(-1), 'bmi': B.reshape(-1)})
pred_bag_2d = model_bag_2d.predict_proba(grid)[:, 1].reshape(A.shape)

# Filled contour of the predicted probability over the age/BMI plane
contour = plt.contourf(A, B, pred_bag_2d, levels=50, cmap='RdBu', alpha=0.7)
plt.colorbar(contour, label="P(당뇨병=1)")
plt.title("배깅의 당뇨병 예측 확률")
plt.xlabel("나이(Age)")
plt.ylabel("BMI")
plt.show()

# Random forest on age and bmi with max_features=1.
model_rf_2d = RandomForestClassifier(random_state=2025, max_features=1, n_estimators=100)
model_rf_2d.fit(X=hn19.loc[:, ['age', 'bmi']], y=hn19['diabetes'])

RandomForestClassifier(max_features=1, random_state=2025)

RandomForestClassifier(max_features=1, random_state=2025)

# Build a 100 x 100 grid spanning the observed age and BMI ranges
age_seq = np.linspace(hn19['age'].min(), hn19['age'].max(), num=100)
bmi_seq = np.linspace(hn19['bmi'].min(), hn19['bmi'].max(), num=100)
A, B = np.meshgrid(age_seq, bmi_seq)

# One row per grid point, then the random forest's probability for each
grid = pd.DataFrame({'age': A.reshape(-1), 'bmi': B.reshape(-1)})
pred_rf_2d = model_rf_2d.predict_proba(grid)[:, 1].reshape(A.shape)

# Filled contour of the predicted probability over the age/BMI plane
contour = plt.contourf(A, B, pred_rf_2d, levels=50, cmap='RdBu', alpha=0.7)
plt.colorbar(contour, label="P(당뇨병=1)")
plt.title("랜덤 포레스트의 당뇨병 예측 확률")
plt.xlabel("나이(Age)")
plt.ylabel("BMI")
plt.show()

# Training arrays for the two-input network.
# NOTE(review): only the first 100 rows are used - confirm this is intended.
X_nn = hn19[['age','bmi']].iloc[0:100].values
y_nn = hn19['diabetes'].iloc[0:100].values

# Small feed-forward network: 2 inputs -> 2 ReLU units -> 1 sigmoid output.
# The input shape is declared with an explicit keras.Input layer; passing
# `input_shape` to Dense is deprecated and triggers a UserWarning.
model_nn_2d = keras.Sequential([
    keras.Input(shape=(2,)),
    keras.layers.Dense(2, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
model_nn_2d.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_nn_2d.fit(X_nn, y_nn, epochs=50, verbose=0)
# evaluate() returns [loss, accuracy]; the accuracy is measured on the same
# rows the model was trained on.
print("신경망 정확도:", model_nn_2d.evaluate(X_nn, y_nn, verbose=0)[1])

C:\Users\rupik\anaconda3\Lib\site-packages\keras\src\layers\core\dense.py:92: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)

신경망 정확도: 0.6499999761581421

# Build a 100 x 100 grid spanning the observed age and BMI ranges
age_seq = np.linspace(hn19['age'].min(), hn19['age'].max(), num=100)
bmi_seq = np.linspace(hn19['bmi'].min(), hn19['bmi'].max(), num=100)
A, B = np.meshgrid(age_seq, bmi_seq)

# One row per grid point; the network output is folded back to the grid shape
grid = pd.DataFrame({'age': A.reshape(-1), 'bmi': B.reshape(-1)})
pred_nn_2d = model_nn_2d.predict(grid).reshape(A.shape)

# Filled contour of the network output over the age/BMI plane
contour = plt.contourf(A, B, pred_nn_2d, levels=50, cmap='RdBu', alpha=0.7)
plt.colorbar(contour, label="P(당뇨병=1)")
plt.title("인공 신경망의 당뇨병 예측 확률")
plt.xlabel("나이(Age)")
plt.ylabel("BMI")
plt.show()

313/313 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step

# Split the rows into train/test parts; test_size=0.9 assigns 90% of the
# rows to the test part.
train, test = train_test_split(hn19, random_state=2025, test_size=0.9)

# Predictors and outcome for each part
X_train = train[['age','bmi','hp']]
y_train = train['diabetes']
X_test = test[['age','bmi','hp']]
y_test = test['diabetes']

def calc_metrics(thresholds, actual, predicted_prob):
    """Compute accuracy, sensitivity and specificity at each threshold.

    Parameters
    ----------
    thresholds : iterable of float
        Cut-offs to evaluate; an observation is classified as positive when
        its predicted probability is strictly greater than the cut-off.
    actual : array-like
        True labels coded as 0 (negative) / 1 (positive).  Lists, NumPy
        arrays and pandas Series are accepted.
    predicted_prob : array-like
        Predicted scores, one per element of ``actual``.  An ``(n, 1)``
        column is flattened to ``(n,)``.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with the columns ``Threshold``, ``Accuracy``,
        ``Sensitivity`` and ``Specificity``.  A metric whose denominator is
        zero is reported as 0.

    Raises
    ------
    ValueError
        If ``actual`` and ``predicted_prob`` differ in length.
    """
    # Work on flat NumPy arrays so that the comparison is positional: plain
    # lists are supported and pandas index alignment cannot reorder values.
    actual = np.asarray(actual).ravel()
    predicted_prob = np.asarray(predicted_prob).ravel()
    if actual.shape != predicted_prob.shape:
        raise ValueError(
            "actual and predicted_prob must have the same length, got "
            f"{actual.shape[0]} and {predicted_prob.shape[0]}"
        )

    # The label masks do not depend on the threshold, so build them once.
    is_pos = actual == 1
    is_neg = actual == 0

    results = []
    for t in thresholds:
        # Positive when the predicted probability exceeds the threshold
        pred_pos = predicted_prob > t

        # Confusion-matrix cells
        TP = int(np.sum(pred_pos & is_pos))
        TN = int(np.sum(~pred_pos & is_neg))
        FP = int(np.sum(pred_pos & is_neg))
        FN = int(np.sum(~pred_pos & is_pos))

        # Metrics, each guarded against an empty denominator
        total = TP + TN + FP + FN
        accuracy = (TP + TN) / total if total > 0 else 0.0
        sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0.0
        specificity = TN / (TN + FP) if (TN + FP) > 0 else 0.0

        results.append((t, accuracy, sensitivity, specificity))
    return pd.DataFrame(results, columns=["Threshold","Accuracy","Sensitivity","Specificity"])

def plot_metrics(thresholds, actual, predicted):
    """Plot accuracy, sensitivity and specificity against the threshold.

    The three metrics are obtained from ``calc_metrics(thresholds, actual,
    predicted)`` and drawn as one line each on a single figure, which is
    then shown.  Nothing is returned.
    """
    # Long form: one row per (threshold, metric) pair, as seaborn expects
    long_form = calc_metrics(thresholds, actual, predicted).melt(
        id_vars="Threshold",
        value_vars=["Accuracy","Sensitivity","Specificity"],
        var_name="Metric",
        value_name="Value",
    )

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.lineplot(data=long_form, x="Threshold", y="Value", hue="Metric", linewidth=1.5, ax=ax)
    ax.set_xlabel("Threshold")
    ax.set_ylabel("Metric Value")
    ax.set_title("Accuracy / Sensitivity / Specificity")
    ax.legend(title="Metric")
    plt.show()

def plot_roc(actual, predicted):
    """Draw the ROC curve for ``predicted`` scores against ``actual`` labels.

    The area under the curve is shown in the legend and a dashed diagonal is
    added as a visual reference.  The figure is shown; nothing is returned.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(actual, predicted)
    area = auc(false_pos_rate, true_pos_rate)

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.plot(false_pos_rate, true_pos_rate, label=f"AUC = {area:.3f}")
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("ROC Curve")
    ax.legend()
    plt.show()

# Candidate cut-offs 0.00, 0.01, ..., 1.00
thresholds = np.arange(start=0, stop=1.01, step=0.01)

# Depth-2 tree fitted on the training part, scored on the test part
model_tree = DecisionTreeClassifier(random_state=2025, max_depth=2)
model_tree.fit(X=X_train, y=y_train)
pred_tree = model_tree.predict_proba(X_test)[:, 1]

# Threshold curves followed by the ROC curve
plot_metrics(thresholds=thresholds, actual=y_test, predicted=pred_tree)

plot_roc(actual=y_test, predicted=pred_tree)

# Bagging (max_features equals the three predictors) fitted on the training
# part and scored on the test part
model_bag = RandomForestClassifier(random_state=2025, max_features=3, n_estimators=100)
model_bag.fit(X=X_train, y=y_train)
pred_bag = model_bag.predict_proba(X_test)[:, 1]

# Threshold curves followed by the ROC curve
plot_metrics(thresholds=thresholds, actual=y_test, predicted=pred_bag)

plot_roc(actual=y_test, predicted=pred_bag)

# Random forest (max_features=2) fitted on the training part and scored on
# the test part
model_rf = RandomForestClassifier(random_state=2025, max_features=2, n_estimators=100)
model_rf.fit(X=X_train, y=y_train)
pred_rf = model_rf.predict_proba(X_test)[:, 1]

# Threshold curves followed by the ROC curve
plot_metrics(thresholds=thresholds, actual=y_test, predicted=pred_rf)

plot_roc(actual=y_test, predicted=pred_rf)

# NOTE(review): this cell repeats the bagging evaluation that already appears
# before the random forest evaluation (same estimator settings, same data).
model_bag = RandomForestClassifier(random_state=2025, max_features=3, n_estimators=100)
model_bag.fit(X=X_train, y=y_train)
pred_bag = model_bag.predict_proba(X_test)[:, 1]

# Threshold curves followed by the ROC curve
plot_metrics(thresholds=thresholds, actual=y_test, predicted=pred_bag)

plot_roc(actual=y_test, predicted=pred_bag)

# Small feed-forward network: 3 inputs -> 2 ReLU units -> 1 sigmoid output.
# The input shape is declared with an explicit keras.Input layer; passing
# `input_shape` to Dense is deprecated and triggers a UserWarning.
model_nn = keras.Sequential([
    keras.Input(shape=(3,)),
    keras.layers.Dense(2, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_nn.fit(X_train, y_train, epochs=50, verbose=0)
# predict() returns one column per output unit; take the single column as a
# flat vector of scores for the test part.
pred_nn = model_nn.predict(X_test)[:, 0]

C:\Users\rupik\anaconda3\Lib\site-packages\keras\src\layers\core\dense.py:92: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)

216/216 ━━━━━━━━━━━━━━━━━━━━ 2s 6ms/step

# Threshold curves followed by the ROC curve for the neural network scores
plot_metrics(thresholds=thresholds, actual=y_test, predicted=pred_nn)

plot_roc(actual=y_test, predicted=pred_nn)

모델링 2 (Python)

실습

1. 분석을 위해 필요한 라이브러리 및 데이터 불러오기

2. 변수 생성 및 결측 처리

3. 상관관계 분석 및 시각화

4. BMI와 당뇨병 관계 시각화

5. 로지스틱 회귀 (2변수 모형) 및 예측 확률 시각화

6. 로지스틱 회귀 (다변량) 및 잔차 진단

7. 결정 나무 (2변수와 다변량 모델 + 가지치기)

8. 앙상블 모형 (배깅 & 랜덤 포레스트)

9. 신경망 모형 (Neural Network)

10. 모델 평가 (데이터 분리, Threshold 성능 분석, ROC Curve)

연습 문제

1. 로지스틱 회귀 모형

풀이

2. 결정나무

풀이

3. 배깅

풀이

4. 랜덤 포레스트

풀이

5. 인공 신경망

풀이

6. 모델 평가

풀이

	age	HE_BMI	HE_HP	HE_DM_HbA1c
0	61	25.987394	3.0	3.0
1	28	16.900942	1.0	1.0
2	53	19.781829	2.0	1.0
3	50	26.631647	1.0	2.0
4	16	NaN	NaN	NaN

	age	bmi
8107	43	15.375148
8108	16	14.844970
8109	8	14.528000