실전프로젝트 5일차 - 머신러닝 모델 구축

카테고리 없음

실전프로젝트 5일차 - 머신러닝 모델 구축

iron-min 2025. 12. 8. 23:19

시계열 특강을 듣고 머신러닝 모델을 다른 방향으로 수정해보기로 했습니다.

데이터를 보면 시퀀스가 LOT 단위의 시계열로 구분되고, 한 개의 LOT 안의 데이터는 Batch 형태로 구성되어 있습니다. 그래서 각 시퀀스를 3등분하여 구간별 데이터를 생성해 보기로 했습니다.

1. 데이터 구간분류 하기

1-1 구간분류 시각화

① 전류

# Number of panels per row in the subplot grid.
N_COLS = 3

# Distinct sequence_index values in ascending order; each gets its own panel.
unique_sequences = sorted(mil['sequence_index'].unique())
num_plots = len(unique_sequences)

# Ceiling division: the fewest rows able to hold num_plots panels.
n_rows = -(-num_plots // N_COLS)

fig, axes = plt.subplots(
    n_rows,
    N_COLS,
    figsize=(N_COLS * 7, n_rows * 5),
    sharex=False,
)
axes = axes.flatten()

for i, (ax, seq_index) in enumerate(zip(axes, unique_sequences)):
    # Keep only the rows of the current sequence.
    filtered_data = mil[mil['sequence_index'] == seq_index].copy()

    # Current over time, coloured by the failure flag.
    # The legend is drawn on the first panel only.
    sns.lineplot(
        ax=ax,
        data=filtered_data,
        x='pk_datetime',
        y='ampere',
        hue='failure',
        legend='full' if i == 0 else False,
    )

    # Dashed red guides at 33% and 66% of this sequence's time span.
    x_min = filtered_data['pk_datetime'].min()
    x_max = filtered_data['pk_datetime'].max()
    span = x_max - x_min
    for fraction in (0.33, 0.66):
        ax.axvline(x=x_min + span * fraction,
                   linestyle='--', color='red', linewidth=1)

    # Panel title and axis labels.
    ax.set_title(f'Sequence Index {seq_index} 전류 (Ampere) 추이 sequence 별', fontsize=14)
    ax.set_xlabel('시간 (pk_datetime)', fontsize=10)
    ax.set_ylabel('전류 (Ampere)', fontsize=12)

    # Tilt and resize the x tick labels.
    for label in ax.get_xticklabels():
        label.set_rotation(45)
        label.set_fontsize(10)

# Remove the grid cells that received no plot.
for spare_ax in axes[num_plots:]:
    fig.delaxes(spare_ax)

# Figure-level title; leave headroom for it when tightening the layout.
fig.suptitle('Sequence Index 구간별 전류 추이 비교',
             fontsize=18, fontweight='bold')
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

② 전압

# Number of panels per row in the subplot grid.
N_COLS = 3

# Distinct sequence_index values in ascending order; each gets its own panel.
unique_sequences = sorted(mil['sequence_index'].unique())
num_plots = len(unique_sequences)

# Ceiling division: the fewest rows able to hold num_plots panels.
n_rows = -(-num_plots // N_COLS)

fig, axes = plt.subplots(
    n_rows,
    N_COLS,
    figsize=(N_COLS * 7, n_rows * 5),
    sharex=False,
)
axes = axes.flatten()

for i, (ax, seq_index) in enumerate(zip(axes, unique_sequences)):
    # Keep only the rows of the current sequence.
    filtered_data = mil[mil['sequence_index'] == seq_index].copy()

    # Voltage over time, coloured by the failure flag.
    # The legend is drawn on the first panel only.
    sns.lineplot(
        ax=ax,
        data=filtered_data,
        x='pk_datetime',
        y='volt',
        hue='failure',
        legend='full' if i == 0 else False,
    )

    # Dashed red guides at 33% and 66% of this sequence's time span.
    x_min = filtered_data['pk_datetime'].min()
    x_max = filtered_data['pk_datetime'].max()
    span = x_max - x_min
    for fraction in (0.33, 0.66):
        ax.axvline(x=x_min + span * fraction,
                   linestyle='--', color='red', linewidth=1)

    # Panel title and axis labels.
    ax.set_title(f'Sequence Index {seq_index} 전압 (Volt) 추이 sequence별', fontsize=14)
    ax.set_xlabel('시간 (pk_datetime)', fontsize=10)
    ax.set_ylabel('전압 (Volt)', fontsize=12)

    # Tilt and resize the x tick labels.
    for label in ax.get_xticklabels():
        label.set_rotation(45)
        label.set_fontsize(10)

# Remove the grid cells that received no plot.
for spare_ax in axes[num_plots:]:
    fig.delaxes(spare_ax)

# Figure-level title; leave headroom for it when tightening the layout.
fig.suptitle('Sequence Index 구간별 전압 추이 비교',
             fontsize=18, fontweight='bold')
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

③ 온도

# Number of panels per row in the subplot grid.
N_COLS = 3

# Distinct sequence_index values in ascending order; each gets its own panel.
unique_sequences = sorted(mil['sequence_index'].unique())
num_plots = len(unique_sequences)

# Enough rows to hold every panel.
n_rows = int(np.ceil(num_plots / N_COLS))

fig, axes = plt.subplots(n_rows, N_COLS,
                         figsize=(N_COLS * 7, n_rows * 5),
                         sharex=False)
axes = axes.flatten()

for i, seq_index in enumerate(unique_sequences):
    ax = axes[i]

    # Keep only the rows of the current sequence.
    filtered_data = mil[mil['sequence_index'] == seq_index].copy()

    # Temperature over time, coloured by the failure flag.
    # The legend is drawn on the first panel only.
    sns.lineplot(
        ax=ax,
        data=filtered_data,
        x='pk_datetime',
        y='temperature',
        hue='failure',
        legend='full' if i == 0 else False
    )

    # Dashed red guides at 33% and 66% of this sequence's time span.
    x_min = filtered_data['pk_datetime'].min()
    x_max = filtered_data['pk_datetime'].max()
    x_33 = x_min + (x_max - x_min) * 0.33
    x_66 = x_min + (x_max - x_min) * 0.66
    ax.axvline(x=x_33, linestyle='--', color='red', linewidth=1)
    ax.axvline(x=x_66, linestyle='--', color='red', linewidth=1)

    # Panel title and axis labels. The y label used to be the truncated
    # string '온'; it now spells out the full word like the sibling plots.
    ax.set_title(f'Sequence Index {seq_index} 온도 추이 sequence별', fontsize=14)
    ax.set_xlabel('시간 (pk_datetime)', fontsize=10)
    ax.set_ylabel('온도', fontsize=12)

    # Tilt and resize the x tick labels.
    for tick in ax.get_xticklabels():
        tick.set_rotation(45)
        tick.set_fontsize(10)

# Remove the grid cells that received no plot.
for j in range(num_plots, n_rows * N_COLS):
    fig.delaxes(axes[j])

# Figure-level title; leave headroom for it when tightening the layout.
fig.suptitle('Sequence Index 구간별 온도 추이 비교',
             fontsize=18, fontweight='bold')
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

1-2 피쳐링 및 데이터 분리

① 피쳐링

# Lag, rolling and delta features, all computed inside each sequence_index.
# Previously shift()/diff() ran over the whole frame and the rolling results
# were shifted after the groups had been concatenated, so the first rows of a
# sequence picked up values from the end of the previous sequence.
signal_labels = {'ampere': '전류', 'volt': '전압', 'temperature': '온도'}
by_sequence = mil.groupby('sequence_index')

# Value of each signal on the previous row of the same sequence.
for column in signal_labels:
    mil[f'{column}_lag1'] = by_sequence[column].shift(1)

# Rolling mean over 3 rows, shifted by one row so the current row is excluded.
for column, label in signal_labels.items():
    mil[f'{label}이동평균'] = by_sequence[column].transform(
        lambda s: s.rolling(window=3).mean().shift(1)
    )

# Rolling standard deviation with the same window and shift.
for column, label in signal_labels.items():
    mil[f'{label}이동표준편차'] = by_sequence[column].transform(
        lambda s: s.rolling(window=3).std().shift(1)
    )

# Row-to-row change of each signal within the sequence.
for column, label in signal_labels.items():
    mil[f'△{label}'] = by_sequence[column].diff()

② 구간분류

# Group the rows by sequence so each sequence can be processed on its own.
sequence_area = mil.groupby(by='sequence_index')
def split_into_tertiles(group):
    """Return *group* sorted by index with an added ``tertile`` column.

    Rows are labelled 0, 1 or 2 according to which third of the sorted
    sequence their position falls in. The column is an ordered categorical
    with categories ``[0, 1, 2]``.

    Groups with fewer than two rows cannot be cut into quantiles (``pd.qcut``
    raises because all bin edges coincide), so their rows are all assigned
    to tertile 0 instead.
    """
    labels = [0, 1, 2]
    # sort_index returns a new frame, so the caller's data is not modified.
    group = group.sort_index()
    n = len(group)
    if n < 2:
        group['tertile'] = pd.Categorical.from_codes(
            np.zeros(n, dtype=int), categories=labels, ordered=True
        )
    else:
        # Cut row positions (not values) into three equal-frequency bins.
        group['tertile'] = pd.qcut(np.arange(n), 3, labels=labels)
    return group

# Label every row with its tertile, sequence by sequence, then flatten the
# result back to a plain positional index.
df_tertile = sequence_area.apply(split_into_tertiles).reset_index(drop=True)

# Average every column per (sequence, tertile).
# `tertile` is categorical, so observed=True keeps only the combinations that
# actually occur; otherwise pandas adds an all-NaN row for every missing
# tertile of a short sequence (and warns about the changing default).
구간별_데이터 = (
    df_tertile
    .groupby(['sequence_index', 'tertile'], observed=True)
    .mean()
    .reset_index()
)

2. 구간별 머신러닝 모델 설계

① 머신러닝 준비단계

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import RocCurveDisplay, roc_auc_score, roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Columns taken from the per-tertile table: raw signals, lag / rolling / delta
# features, the target ('failure') and the remaining descriptive columns.
features_to_use = [
    'volt', 'ampere', 'temperature',
    'ampere_lag1', 'volt_lag1', 'temperature_lag1',
    '전류이동평균', '전압이동평균', '온도이동평균',
    '전류이동표준편차', '전압이동표준편차', '온도이동표준편차',
    '△전류', '△전압', '△온도',
    'failure', '두께변화량', 'tertile',
]

# Select the columns and discard any row that still contains a NaN.
mil_머신 = 구간별_데이터[features_to_use].dropna()

# Target is 'failure'; every other selected column is a predictor.
y = mil_머신['failure']
X = mil_머신.drop(columns='failure')

# 80/20 shuffled hold-out split that preserves the class ratio of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# Metrics reported by cross_validate; the precision/recall/f1 scorers use
# their default positive label.
scoring = dict(
    precision=make_scorer(precision_score),
    recall=make_scorer(recall_score),
    f1=make_scorer(f1_score),
    accuracy='accuracy',
    roc_auc='roc_auc',
)

② 랜덤포레스트

# 파이프라인 설계 및 교차 검증
rf_pipe = Pipeline([
    ('smote', SMOTE(k_neighbors=2, random_state=42)),
    ('clf', RandomForestClassifier(
        n_estimators=200,      # 트리 개수
        max_depth=None,        # 깊이 제한 없음 (필요하면 숫자로 제한 가능)
        min_samples_split=10,  # 노드 분할 최소 샘플 수
        min_samples_leaf=5,    # 리프 노드 최소 샘플 수
        n_jobs=-1,             # 코어 모두 사용
        random_state=42
    ))
])

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

rf_cv_results = cross_validate(
    rf_pipe,
    X, y,
    cv=cv,
    n_jobs=1,
    scoring=scoring
)

# train 데이터로 RF 파이프라인 학습
rf_pipe.fit(X_train, y_train)

# test.train 데이터 예측
y_pred_rf_test = rf_pipe.predict(X_test)
y_pred_rf_train = rf_pipe.predict(X_train)

# 성능 평가
print(classification_report(y_test, y_pred_rf_test))
print("Confusion Matrix_test:")
print(confusion_matrix(y_test, y_pred_rf_test))

print(classification_report(y_train, y_pred_rf_train))
print("Confusion Matrix_train:")
print(confusion_matrix(y_train, y_pred_rf_train))

# 특성 중요도
rf_model= rf_pipe['clf']
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
feature_importances.sort_values(ascending=False)

# 5-fold ROC curves for the RandomForest + SMOTE pipeline.
fig, ax = plt.subplots(figsize=(7, 7))

# Common FPR grid onto which every fold's curve is interpolated.
mean_fpr = np.linspace(0, 1, 100)
tprs = []
aucs = []

for i, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    # Train / validation rows of this fold.
    X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
    y_train_cv, y_val_cv = y.iloc[train_idx], y.iloc[val_idx]

    # A ROC curve is undefined when the validation fold holds one class only.
    # Skip such folds, as the other models' ROC loops do, so they cannot
    # distort the mean curve.
    if y_val_cv.nunique() < 2:
        print(f"Fold {i+1}: 검증 데이터에 한 클래스만 있어 ROC 계산 불가 → 스킵")
        continue

    # Fit a fresh copy so the already-fitted rf_pipe is left untouched.
    pipe_cv = clone(rf_pipe)
    pipe_cv.fit(X_train_cv, y_train_cv)

    # Predicted probability of the second class (column 1).
    y_proba_cv = pipe_cv.predict_proba(X_val_cv)[:, 1]

    fpr, tpr, _ = roc_curve(y_val_cv, y_proba_cv)
    roc_auc = auc(fpr, tpr)

    ax.plot(fpr, tpr, lw=1, alpha=0.7,
            label=f'Fold {i+1} ROC (AUC = {roc_auc:.3f})')

    # Interpolate onto the common grid and pin the curve to the origin.
    tpr_interp = np.interp(mean_fpr, fpr, tpr)
    tpr_interp[0] = 0.0
    tprs.append(tpr_interp)
    aucs.append(roc_auc)

# Mean ROC over the folds that were not skipped.
if tprs:
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)

    ax.plot(mean_fpr, mean_tpr, lw=2, color='black',
            label=f'Mean ROC (AUC = {mean_auc:.3f})')

# Diagonal reference line of a random classifier.
ax.plot([0, 1], [0, 1], linestyle='--', lw=1, color='grey', label='Random')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('RandomForest + SMOTE 5-fold ROC Curve')
ax.legend(loc='lower right')
plt.show()

③ Decision Tree

# 파이프라인 설계 및 교차 검증 (Decision Tree 버전)
dt_pipe = Pipeline([
    ('smote', SMOTE(k_neighbors=2, random_state=42)),
    ('clf', DecisionTreeClassifier(
        max_depth=None,        # 필요하면 예: 5, 10 등으로 제한 가능
        min_samples_split=10,  # 노드 분할 최소 샘플 수
        min_samples_leaf=5,    # 리프 노드 최소 샘플 수
        random_state=42
    ))
])

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

dt_cv_results = cross_validate(
    dt_pipe,
    X, y,
    cv=cv,
    n_jobs=1,
    scoring=scoring
)

# train 데이터로 DT 파이프라인 학습
dt_pipe.fit(X_train, y_train)

# test/train 데이터 예측
y_pred_dt_test = dt_pipe.predict(X_test)
y_pred_dt_train = dt_pipe.predict(X_train)

# 성능 평가
print("=== Decision Tree - Test ===")
print(classification_report(y_test, y_pred_dt_test))
print("Confusion Matrix_test:")
print(confusion_matrix(y_test, y_pred_dt_test))

print("\n=== Decision Tree - Train ===")
print(classification_report(y_train, y_pred_dt_train))
print("Confusion Matrix_train:")
print(confusion_matrix(y_train, y_pred_dt_train))

# 특성 중요도
dt_model = dt_pipe['clf']
feature_importances = pd.Series(dt_model.feature_importances_, index=X.columns)
feature_importances.sort_values(ascending=False)

# 5-fold ROC curves for the Decision Tree + SMOTE pipeline.
fig, ax = plt.subplots(figsize=(7, 7))

# Common FPR grid onto which every fold's curve is interpolated.
mean_fpr = np.linspace(0, 1, 100)
tprs, aucs, roc_auc_per_fold = [], [], []

for i, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
    y_train_cv, y_val_cv = y.iloc[train_idx], y.iloc[val_idx]

    # No ROC curve can be drawn when the validation fold has a single class.
    if y_val_cv.nunique() < 2:
        print(f"Fold {i+1}: 검증 데이터에 한 클래스만 있어 ROC 계산 불가 → 스킵")
        continue

    # Fit a fresh copy so the already-fitted dt_pipe is left untouched.
    pipe_cv = clone(dt_pipe).fit(X_train_cv, y_train_cv)

    # Predicted probability of the second class (column 1).
    y_proba_cv = pipe_cv.predict_proba(X_val_cv)[:, 1]

    fpr, tpr, _ = roc_curve(y_val_cv, y_proba_cv)
    roc_auc = auc(fpr, tpr)
    roc_auc_per_fold.append(roc_auc)
    aucs.append(roc_auc)

    ax.plot(
        fpr, tpr,
        lw=1, alpha=0.7,
        label=f'Fold {i+1} ROC (AUC = {roc_auc:.3f})',
    )

    # Interpolate onto the common grid and pin the curve to the origin.
    tpr_interp = np.interp(mean_fpr, fpr, tpr)
    tpr_interp[0] = 0.0
    tprs.append(tpr_interp)

# Mean ROC over the folds that were not skipped.
if tprs:
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)

    ax.plot(
        mean_fpr, mean_tpr,
        lw=2, color='black',
        label=f'Mean ROC (AUC = {mean_auc:.3f})',
    )

# Diagonal reference line of a random classifier.
ax.plot([0, 1], [0, 1], linestyle='--', lw=1, color='grey', label='Random')

ax.set_title('Decision Tree + SMOTE 5-fold ROC Curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.legend(loc='lower right')
plt.show()

# Per-fold AUC values and their mean.
roc_auc_per_fold = np.array(roc_auc_per_fold)
print("roc_auc 각 fold 점수:", roc_auc_per_fold)
print(f"roc_auc 평균: {roc_auc_per_fold.mean():.4f}")
print("-" * 40)

④ XGBoost

# y 라벨 재코딩: 0/1 이진 분류
y_xgb = (y == 1.0).astype(int)

# 파이프라인 설계
xgb_pipe = Pipeline([
    ('smote', SMOTE(k_neighbors=2, random_state=42)),
    ('clf', XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        objective='binary:logistic',
        eval_metric='logloss',
        n_jobs=-1,
        random_state=42,
        enable_categorical=True,
        tree_method='hist'
    ))
])

# Stratified K-Fold & scoring 정의
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

# 교차검증
xgb_cv_results = cross_validate(
    xgb_pipe,
    X, y_xgb,
    cv=cv,
    scoring=scoring,
    return_train_score=False,
    n_jobs=1
)

# train/test 분리 (y_xgb 사용!)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_xgb,
    test_size=0.2,
    random_state=42,
    stratify=y_xgb
)

# 파이프라인 학습
xgb_pipe.fit(X_train, y_train)

# 예측
y_pred_train = xgb_pipe.predict(X_train)
y_pred_test = xgb_pipe.predict(X_test)

# 성능 평가
print("=== Train ===")
print(classification_report(y_train, y_pred_train))
print("Confusion Matrix (Train):")
print(confusion_matrix(y_train, y_pred_train))

print("\n=== Test ===")
print(classification_report(y_test, y_pred_test))
print("Confusion Matrix (Test):")
print(confusion_matrix(y_test, y_pred_test))

# feature importance 확인
xgb_model= xgb_pipe['clf']
feature_importances = pd.Series(xgb_model.feature_importances_, index=X.columns)
feature_importances.sort_values(ascending=False)

# 5-fold ROC curves for the XGBoost + SMOTE pipeline.
fig, ax = plt.subplots(figsize=(7, 7))

# Common FPR grid onto which every fold's curve is interpolated.
mean_fpr = np.linspace(0, 1, 100)
tprs, aucs, roc_auc_per_fold = [], [], []

for i, (train_idx, val_idx) in enumerate(cv.split(X, y_xgb)):
    X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
    y_train_cv, y_val_cv = y_xgb.iloc[train_idx], y_xgb.iloc[val_idx]

    # No ROC curve can be drawn when the validation fold has a single class.
    if y_val_cv.nunique() < 2:
        print(f"Fold {i+1}: 검증 데이터에 한 클래스만 있어 ROC 계산 불가 → 스킵")
        continue

    # Fit a fresh copy so the already-fitted xgb_pipe is left untouched.
    pipe_cv = clone(xgb_pipe).fit(X_train_cv, y_train_cv)

    # Predicted probability of class 1.
    y_proba_cv = pipe_cv.predict_proba(X_val_cv)[:, 1]

    fpr, tpr, _ = roc_curve(y_val_cv, y_proba_cv)
    roc_auc = auc(fpr, tpr)
    roc_auc_per_fold.append(roc_auc)
    aucs.append(roc_auc)

    ax.plot(
        fpr, tpr,
        lw=1, alpha=0.7,
        label=f'Fold {i+1} ROC (AUC = {roc_auc:.3f})',
    )

    # Interpolate onto the common grid and pin the curve to the origin.
    tpr_interp = np.interp(mean_fpr, fpr, tpr)
    tpr_interp[0] = 0.0
    tprs.append(tpr_interp)

# Mean ROC over the folds that were not skipped.
if tprs:
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)

    ax.plot(
        mean_fpr, mean_tpr,
        lw=2, color='black',
        label=f'Mean ROC (AUC = {mean_auc:.3f})',
    )

# Diagonal reference line of a random classifier.
ax.plot([0, 1], [0, 1], linestyle='--', lw=1, color='grey', label='Random')

ax.set_title('XGBoost + SMOTE 5-fold ROC Curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.legend(loc='lower right')
plt.show()

⑤ Light GBM

# Integer 0/1 target: 1 where y equals 1.0, otherwise 0.
y_lgb = (y == 1.0).astype(int)

# SMOTE oversampling followed by a LightGBM binary classifier.
lgb_pipe = Pipeline([
    ('smote', SMOTE(k_neighbors=2, random_state=42)),
    (
        'clf',
        LGBMClassifier(
            objective='binary',
            n_estimators=300,
            learning_rate=0.05,
            num_leaves=31,
            max_depth=5,           # trees are capped at depth 5
            # NOTE(review): LightGBM's docs say row subsampling only takes
            # effect when subsample_freq > 0; confirm bagging is really active.
            subsample=0.8,
            colsample_bytree=0.8,  # fraction of features sampled per tree
            reg_lambda=1.0,
            n_jobs=-1,
            random_state=42,
            verbose=-1,
        ),
    ),
])

# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics for this cross-validation run (no ROC AUC entry here).
scoring = dict(
    precision=make_scorer(precision_score),
    recall=make_scorer(recall_score),
    f1=make_scorer(f1_score),
    accuracy='accuracy',
)

lgb_cv_results = cross_validate(
    lgb_pipe,
    X,
    y_lgb,
    scoring=scoring,
    cv=cv,
    n_jobs=1,
    return_train_score=False,
)

# Hold-out split stratified on the raw target; the labels are recoded to
# integer 0/1 afterwards.
X_train, X_test, y_train_raw, y_test_raw = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
y_train = (y_train_raw == 1.0).astype(int)
y_test = (y_test_raw == 1.0).astype(int)

# Fit on the training split, then predict both splits.
lgb_pipe.fit(X_train, y_train)
y_pred_lgb_test = lgb_pipe.predict(X_test)
y_pred_lgb_train = lgb_pipe.predict(X_train)

# Classification report and confusion matrix: test split first, then train.
for heading, y_true, y_hat in (
    ("Confusion Matrix_test:", y_test, y_pred_lgb_test),
    ("Confusion Matrix_train:", y_train, y_pred_lgb_train),
):
    print(classification_report(y_true, y_hat))
    print(heading)
    print(confusion_matrix(y_true, y_hat))

# Feature importances of the fitted model, indexed by feature name.
lgb_model = lgb_pipe['clf']
feature_importances = pd.Series(
    lgb_model.feature_importances_,
    index=X.columns,
)
feature_importances.sort_values(ascending=False)

# 5-fold ROC curves for the LightGBM + SMOTE pipeline.
fig, ax = plt.subplots(figsize=(7, 7))

# Common FPR grid onto which every fold's curve is interpolated.
mean_fpr = np.linspace(0, 1, 100)
tprs, aucs, roc_auc_per_fold = [], [], []

for i, (train_idx, val_idx) in enumerate(cv.split(X, y_lgb)):
    X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
    y_train_cv, y_val_cv = y_lgb.iloc[train_idx], y_lgb.iloc[val_idx]

    # No ROC curve can be drawn when the validation fold has a single class.
    if y_val_cv.nunique() < 2:
        print(f"Fold {i+1}: 검증 데이터에 한 클래스만 있어 ROC 계산 불가 → 스킵")
        continue

    # Fit a fresh copy so the already-fitted lgb_pipe is left untouched.
    pipe_cv = clone(lgb_pipe).fit(X_train_cv, y_train_cv)

    # Predicted probability of class 1.
    y_proba_cv = pipe_cv.predict_proba(X_val_cv)[:, 1]

    fpr, tpr, _ = roc_curve(y_val_cv, y_proba_cv)
    roc_auc = auc(fpr, tpr)
    roc_auc_per_fold.append(roc_auc)
    aucs.append(roc_auc)

    ax.plot(
        fpr, tpr,
        lw=1, alpha=0.7,
        label=f'Fold {i+1} ROC (AUC = {roc_auc:.3f})',
    )

    # Interpolate onto the common grid and pin the curve to the origin.
    tpr_interp = np.interp(mean_fpr, fpr, tpr)
    tpr_interp[0] = 0.0
    tprs.append(tpr_interp)

# Mean ROC over the folds that were not skipped.
if tprs:
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)

    ax.plot(
        mean_fpr, mean_tpr,
        lw=2, color='black',
        label=f'Mean ROC (AUC = {mean_auc:.3f})',
    )

# Diagonal reference line of a random classifier.
ax.plot([0, 1], [0, 1], linestyle='--', lw=1, color='grey', label='Random')

ax.set_title('LightGBM + SMOTE 5-fold ROC Curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.legend(loc='lower right')
plt.show()

# Per-fold AUC values and their mean.
roc_auc_per_fold = np.array(roc_auc_per_fold)
print("roc_auc 각 fold 점수:", roc_auc_per_fold)
print(f"roc_auc 평균: {roc_auc_per_fold.mean():.4f}")
print("-" * 40)

3. Light GBM 모델 하이퍼파라미터 튜닝

위 모델들 중에서 Light GBM 모델이 가장 예측률이 좋아서 파라미터를 수정해 더 좋은 모델을 만들어 보았습니다.

기존 Light GBM 모델의 문제는 정상 데이터를 불량(양성)으로 잘못 예측하는 경우(FP)가 생각보다 많다는 점이었습니다. 그래서 FN을 어느 정도 감수하더라도 FP를 줄이는 방향으로 파라미터를 수정하기로 했습니다.

# Integer 0/1 target: 1 where y equals 1.0, otherwise 0.
y_lgb = (y == 1.0).astype(int)

# SMOTE oversampling followed by the re-tuned LightGBM classifier.
lgb_pipe = Pipeline([
    ('smote', SMOTE(k_neighbors=2, random_state=42)),
    (
        'clf',
        LGBMClassifier(
            objective='binary',
            n_estimators=300,
            learning_rate=0.05,
            num_leaves=35,
            max_depth=5,
            # NOTE(review): LightGBM's docs say row subsampling only takes
            # effect when subsample_freq > 0; confirm bagging is really active.
            subsample=0.8,
            colsample_bytree=0.8,
            min_child_samples=30,
            reg_lambda=5.0,
            reg_alpha=2.0,
            # Class 0 is weighted twice as heavily as class 1.
            class_weight={0: 2.0, 1: 1.0},
            n_jobs=-1,
            random_state=42,
            verbose=-1,
        ),
    ),
])

# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics for this cross-validation run (no ROC AUC entry here).
scoring = dict(
    precision=make_scorer(precision_score),
    recall=make_scorer(recall_score),
    f1=make_scorer(f1_score),
    accuracy='accuracy',
)

lgb_cv_results = cross_validate(
    lgb_pipe,
    X,
    y_lgb,
    scoring=scoring,
    cv=cv,
    n_jobs=1,
    return_train_score=False,
)

# Hold-out split stratified on the raw target; the labels are recoded to
# integer 0/1 afterwards.
X_train, X_test, y_train_raw, y_test_raw = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
y_train = (y_train_raw == 1.0).astype(int)
y_test = (y_test_raw == 1.0).astype(int)

# Fit on the training split, then predict both splits.
lgb_pipe.fit(X_train, y_train)
y_pred_lgb_test = lgb_pipe.predict(X_test)
y_pred_lgb_train = lgb_pipe.predict(X_train)

# Classification report and confusion matrix: test split first, then train.
for heading, y_true, y_hat in (
    ("Confusion Matrix_test:", y_test, y_pred_lgb_test),
    ("Confusion Matrix_train:", y_train, y_pred_lgb_train),
):
    print(classification_report(y_true, y_hat))
    print(heading)
    print(confusion_matrix(y_true, y_hat))

# Feature importances of the fitted model, indexed by feature name.
lgb_model = lgb_pipe['clf']
feature_importances = pd.Series(
    lgb_model.feature_importances_,
    index=X.columns,
)
feature_importances.sort_values(ascending=False)

기존에 2개였던 FP가 1개로 줄어든 모습입니다.

그리고 train 데이터의 과적합이 어느정도 해소되었습니다.

# 5-fold ROC curves and AUC for the LightGBM + SMOTE pipeline.
fig, ax = plt.subplots(figsize=(7, 7))

# Common FPR grid onto which every fold's curve is interpolated.
mean_fpr = np.linspace(0, 1, 100)
tprs, aucs, roc_auc_per_fold = [], [], []

for i, (train_idx, val_idx) in enumerate(cv.split(X, y_lgb)):
    X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
    y_train_cv, y_val_cv = y_lgb.iloc[train_idx], y_lgb.iloc[val_idx]

    # No ROC curve can be drawn when the validation fold has a single class.
    if y_val_cv.nunique() < 2:
        print(f"Fold {i+1}: 검증 데이터에 한 클래스만 있어 ROC 계산 불가 → 스킵")
        continue

    # Fit a fresh copy so the already-fitted lgb_pipe is left untouched.
    pipe_cv = clone(lgb_pipe).fit(X_train_cv, y_train_cv)

    # Predicted probability of class 1.
    y_proba_cv = pipe_cv.predict_proba(X_val_cv)[:, 1]

    fpr, tpr, _ = roc_curve(y_val_cv, y_proba_cv)
    roc_auc = auc(fpr, tpr)
    roc_auc_per_fold.append(roc_auc)
    aucs.append(roc_auc)

    ax.plot(
        fpr, tpr,
        lw=1, alpha=0.7,
        label=f'Fold {i+1} ROC (AUC = {roc_auc:.3f})',
    )

    # Interpolate onto the common grid and pin the curve to the origin.
    tpr_interp = np.interp(mean_fpr, fpr, tpr)
    tpr_interp[0] = 0.0
    tprs.append(tpr_interp)

# Mean ROC over the folds that were not skipped.
if tprs:
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)

    ax.plot(
        mean_fpr, mean_tpr,
        lw=2, color='black',
        label=f'Mean ROC (AUC = {mean_auc:.3f})',
    )

# Diagonal reference line of a random classifier.
ax.plot([0, 1], [0, 1], linestyle='--', lw=1, color='grey', label='Random')

ax.set_title('LightGBM + SMOTE 5-fold ROC Curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.legend(loc='lower right')
plt.show()

# Per-fold AUC values and their mean.
roc_auc_per_fold = np.array(roc_auc_per_fold)
print("roc_auc 각 fold 점수:", roc_auc_per_fold)
print(f"roc_auc 평균: {roc_auc_per_fold.mean():.4f}")
print("-" * 40)