# One-time environment setup for Windows PowerShell; run these in a terminal, not in Python.
cd C:\path\to\smote
# Create a virtual environment in .venv and activate it for the current shell session.
python -m venv .venv
.\.venv\Scripts\Activate.ps1
python -m pip install --upgrade pip

# Install (or upgrade) the packages used by the code below, plus the Jupyter tooling.
pip install -U pandas lightgbm scikit-learn seaborn matplotlib imbalanced-learn catboost ipykernel notebook

# Register the environment as a Jupyter kernel named "smote-venv".
python -m ipykernel install --user --name smote-venv --display-name "Python (smote-venv)"

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import confusion_matrix
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the credit-card transactions CSV from the working directory.
data=pd.read_csv("./creditcard.csv")
data.tail()

# Drop the 'Time' and 'Amount' columns before any further analysis.
data=data.drop(['Time','Amount'],axis=1)
data.tail()

# Check whether the data contains NA values
data.isnull() # element-wise NA mask
data.isnull().any() # per-column flag; if any NA exists, replace it with 0 or exclude that column

data.shape

data['V1']

# Feature-only frame: everything except the target column 'Class'.
data_feature = data.drop(['Class'], axis=1)

col_names = data_feature.columns

col_names

# Keep only the first 10 rows; the small-sample plots below use this subset.
data_feature = data_feature[:10]

data_feature

data_feature['V1'].min()

data_feature

# Bar chart of every feature value in the first row of data_feature.
plt.figure(figsize=(20, 6))
y = data_feature.iloc[0]  # first row as a Series indexed by feature name
x = np.arange(len(y))  # one bar position per feature
plt.bar(x, y)
labels = data_feature.columns
plt.title("Value Distribution of the First Row")
plt.xticks(x, labels, rotation=0)  # label each bar with its feature name
plt.xlabel('Feature Name')
plt.ylabel('Feature Values')
plt.tight_layout()

# Overlay one line per row of data_feature so the sampled rows can be compared
# feature by feature.
plt.rcParams["figure.figsize"] = (15,10)  # also becomes the default size for later figures
# Start an explicit new figure so the lines never land on a figure that is
# still current from an earlier plot.
plt.figure()
labels = data_feature.columns
# x positions are the same for every row, so compute them once; the loop uses
# its own index name instead of overwriting `x` while iterating.
x = np.arange(len(labels))
for row_idx in range(data_feature.shape[0]):
    y = data_feature.iloc[row_idx]
    plt.plot(x, y)
plt.title("Value Distribution of 10 Rows")
plt.xticks(x, labels)
plt.xlabel('Feature Name')
plt.ylabel('Feature Values')

data.describe()

# Check the distribution of the target variable ('Class')
from collections import Counter
class_count = Counter(data.Class)  # maps each class label to its row count
print(class_count)
# Imbalance ratio: number of class-0 rows divided by number of class-1 rows.
IR = class_count.get(0) / class_count.get(1)
print(f'IR (Class Imbalance Ratio): {IR:.2f}')

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Horizontal box plot of every feature column, giving a one-figure overview of the data set.
plt.style.use('ggplot') # Using ggplot2 style visuals
f, ax = plt.subplots(figsize = (11, 15)) # figure size

ax.set_facecolor('#fafafa') # axes background color
ax.set(xlim = (-5, 5)) # x-axis range
#ax.set(xlim = (-150, 150)) # alternative, wider x-axis range
plt.ylabel('Variables') # y-axis label
plt.title("Overview Data Set") # plot title
ax = sns.boxplot(
    data = data.drop(columns = ['Class']), # every column except the target
    orient = 'h',
    palette = 'Set2'
)

data.loc[data['Class'] == 1] # rows with Class == 1 (displayed when run in a notebook)
# Original author's note: 492 such rows -> a class-imbalanced data set

var = data.columns.values[:-1] # feature names; assumes 'Class' is the last column
t0 = data.loc[data['Class'] == 0] # rows with Class == 0 (normal transactions)
t1 = data.loc[data['Class'] == 1] # rows with Class == 1 (fraud transactions)

sns.set_style('whitegrid') # plot style

# Create exactly one grid sized for the number of features and draw on its
# axes directly. The grid created here is the same one that gets drawn on, so
# no empty figure or overlapping subplot layout is left behind.
n_cols = 4
n_rows = int(np.ceil(len(var) / n_cols))
fig, ax = plt.subplots(n_rows, n_cols, figsize = (16, 28), squeeze = False)
grid_axes = ax.ravel()

# KDE plot
# Kernel Density Estimation is a non-parametric estimate of the probability
# density function (PDF) of a random variable; it smooths a discontinuous
# empirical distribution.
for feature, axis in zip(var, grid_axes):
    sns.kdeplot(t0[feature], bw_method = 0.5, label = "Class = 0", ax = axis)
    sns.kdeplot(t1[feature], bw_method = 0.5, label = "Class = 1", ax = axis)
    axis.set_xlabel(feature, fontsize = 12)
    axis.legend()

# Hide any cells of the grid that have no feature to show.
for axis in grid_axes[len(var):]:
    axis.set_visible(False)
plt.show()

# Separate the feature columns from the target column. Both results are 2-D
# arrays; y keeps a single column, which later cells flatten with ravel().
is_feature_column = data.columns != 'Class'
X = data.loc[:, is_feature_column].to_numpy()
y = data.loc[:, ~is_feature_column].to_numpy()
X.shape, y.shape

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train, X_test, y_train, y_test = split_parts
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

def model_evaluation(label, predict):
    """Print accuracy, precision, recall and F1-score for the positive class (label 1).

    Args:
        label: Ground-truth class labels.
        predict: Predicted class labels, aligned with ``label``.

    Returns:
        None. The four metrics are printed to standard output.
    """
    cf_matrix = confusion_matrix(label, predict)
    # confusion_matrix rows are true classes and columns are predicted classes.
    true_pos = cf_matrix[1][1]
    false_pos = cf_matrix[0][1]
    false_neg = cf_matrix[1][0]
    Accuracy = (cf_matrix[0][0] + true_pos) / cf_matrix.sum()
    # Guard each ratio: when a model predicts no positives (or there are none),
    # the denominator is 0 and the unguarded division yields NaN with a
    # RuntimeWarning. Report 0 instead, as the other metric helpers in this
    # file do.
    Precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) != 0 else 0.0
    Recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) != 0 else 0.0
    F1_Score = (2 * Recall * Precision) / (Recall + Precision) if (Recall + Precision) != 0 else 0.0
    print("Model_Evaluation with Label:1")
    print(f"Accuracy: {Accuracy:1.5f}")
    print(f"Precision: {Precision: 1.5f}")
    print(f"Recall: { Recall: 1.5f}")
    print(f"F1-Score: {F1_Score: 1.5f}")

# Baseline: LightGBM binary classifier trained on the original (non-resampled) training split.
lgb_param = dict(
    max_depth=10,  # tree depth
    learning_rate=0.01,  # step size
    n_estimators=50,  # number of trees
    objective='binary',
    seed=0,  # fixed for reproducibility
    verbosity=-1,  # silence training logs
)

# Wrap the training data in the Dataset object LightGBM trains from.
lgb_dtrain = lgb.Dataset(data=pd.DataFrame(X_train), label=pd.DataFrame(y_train))
lgb_model = lgb.train(params=lgb_param, train_set=lgb_dtrain)

# Threshold the predicted scores at 0.5 to obtain 0/1 labels.
baseline_scores = lgb_model.predict(X_test)
pred = np.where(baseline_scores > 0.5, 1, 0)
model_evaluation(y_test, pred)

from imblearn.over_sampling import SMOTE
# Count labels with NumPy rather than the builtin sum(), which walks the array
# element by element in Python. y_train has a single column, so counting over
# the whole array equals counting that column.
num_minor_class_before = int(np.count_nonzero(y_train == 1))
num_major_class_before = int(np.count_nonzero(y_train == 0))
print(f"SMOTE 수행 이전 label '1' 개수: {num_minor_class_before}") # label-1 rows in y_train
print(f"SMOTE 수행 이전 label '0' 개수: {num_major_class_before:,} \n") # label-0 rows in y_train

sm = SMOTE(random_state=0, sampling_strategy=0.3) # SMOTE with sampling_strategy=0.3
# ravel() flattens y_train to the 1-D label vector fit_resample is given here.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel()) # perform the oversampling

# Report the class counts after resampling.
num_minor_class_after = int(np.count_nonzero(y_train_res == 1))
num_major_class_after = int(np.count_nonzero(y_train_res == 0))
print(f"SMOTE 수행 결과 label '1' 개수: {num_minor_class_after:,}")
# The stray apostrophe after "개수" in the original label was a typo and is removed.
print(f"SMOTE 수행 결과 label '0' 개수: {num_major_class_after:,}")
print(f"Minor Class 증가: {num_minor_class_after - num_minor_class_before:,}")

# Wrap the SMOTE (sampling_strategy=0.3) training data in a LightGBM Dataset.
lgb_dtrain2 = lgb.Dataset(data = pd.DataFrame(X_train_res), label = pd.DataFrame(y_train_res))

lgb_param2 = {
    'max_depth': 10, # tree depth
    'learning_rate': 0.01, # step size
    'n_estimators': 50, # number of trees
    'objective': 'multiclass', # objective function
    'seed': 0, # fixed for reproducibility
    # LightGBM requires every label to lie in [0, num_class), so derive it from
    # the largest label. The previous expression, len(set(pd.DataFrame(y))) + 1,
    # iterated the DataFrame's column labels (always one column) and therefore
    # produced 2 regardless of which labels were present.
    'num_class': int(np.max(y_train_res)) + 1,
}
lgb_model2 = lgb.train(params = lgb_param2, train_set = lgb_dtrain2) # train
# predict() returns one score per class; take the highest-scoring class as the label.
lgb_model2_predict = np.argmax(lgb_model2.predict(X_test), axis = 1)
model_evaluation(y_test, lgb_model2_predict) # print classification metrics

# Class counts in the original training split. They are computed once with
# NumPy; the builtin sum() iterates the array in Python and was repeated for
# every print.
minor_before_60 = int(np.count_nonzero(y_train == 1))
major_before_60 = int(np.count_nonzero(y_train == 0))
print(f"SMOTE 수행 이전 label '1' 개수: {minor_before_60:,}") # label-1 rows in y_train
print(f"SMOTE 수행 이전 label '0' 개수: {major_before_60:,} \n") # label-0 rows in y_train

sm2 = SMOTE(random_state = 0, sampling_strategy=0.6) # SMOTE with sampling_strategy=0.6
X_train_res2, y_train_res2 = sm2.fit_resample(X_train, y_train.ravel()) # perform the oversampling

# Class counts after resampling.
minor_after_60 = int(np.count_nonzero(y_train_res2 == 1))
major_after_60 = int(np.count_nonzero(y_train_res2 == 0))
print(f"SMOTE 수행 결과 label '1' 개수: {minor_after_60:,}")
print(f"SMOTE 수행 결과 label '0' 개수: {major_after_60:,}")
print(f"Minor Class 증가: {minor_after_60 - minor_before_60:,}")

# Wrap the SMOTE (sampling_strategy=0.6) training data in a LightGBM Dataset.
lgb_dtrain3 = lgb.Dataset(data = pd.DataFrame(X_train_res2), label = pd.DataFrame(y_train_res2))

lgb_param3 = {
    'max_depth': 10, # tree depth
    'learning_rate': 0.01, # step size
    'n_estimators': 50, # number of trees
    'objective': 'multiclass', # objective function
    'seed': 0, # fixed for reproducibility
    # Labels must lie in [0, num_class), so derive it from the largest label.
    # The previous len(set(pd.DataFrame(y))) + 1 counted DataFrame columns,
    # not classes, and only happened to equal 2.
    'num_class': int(np.max(y_train_res2)) + 1,
}

lgb_model3 = lgb.train(params = lgb_param3, train_set = lgb_dtrain3) # train
# predict() returns one score per class; take the highest-scoring class as the label.
lgb_model3_predict = np.argmax(lgb_model3.predict(X_test), axis = 1)
model_evaluation(y_test, lgb_model3_predict) # print classification metrics

# Class counts in the original training split. They are computed once with
# NumPy; the builtin sum() iterates the array in Python and was repeated for
# every print.
minor_before_100 = int(np.count_nonzero(y_train == 1))
major_before_100 = int(np.count_nonzero(y_train == 0))
print(f"SMOTE 수행 이전 label '1' 개수: {minor_before_100:,}") # label-1 rows in y_train
print(f"SMOTE 수행 이전 label '0' 개수: {major_before_100:,} \n") # label-0 rows in y_train

sm3 = SMOTE(random_state = 0) # SMOTE with its default sampling strategy
X_train_res3, y_train_res3 = sm3.fit_resample(X_train, y_train.ravel()) # perform the oversampling

# Class counts after resampling.
minor_after_100 = int(np.count_nonzero(y_train_res3 == 1))
major_after_100 = int(np.count_nonzero(y_train_res3 == 0))
print(f"SMOTE 수행 결과 label '1' 개수: {minor_after_100:,}")
print(f"SMOTE 수행 결과 label '0' 개수: {major_after_100:,}")
print(f"Minor Class 증가: {minor_after_100 - minor_before_100:,}")

# Wrap the SMOTE (default sampling strategy) training data in a LightGBM Dataset.
lgb_dtrain4 = lgb.Dataset(data = pd.DataFrame(X_train_res3), label = pd.DataFrame(y_train_res3))

lgb_param4 = {
    'max_depth': 10, # tree depth
    'learning_rate': 0.01, # step size
    'n_estimators': 50, # number of trees
    'objective': 'multiclass', # objective function
    'seed': 0, # fixed for reproducibility
    # Labels must lie in [0, num_class), so derive it from the largest label.
    # The previous len(set(pd.DataFrame(y))) + 1 counted DataFrame columns,
    # not classes, and only happened to equal 2.
    'num_class': int(np.max(y_train_res3)) + 1,
}
lgb_model4 = lgb.train(params = lgb_param4, train_set = lgb_dtrain4) # train
# predict() returns one score per class; take the highest-scoring class as the label.
lgb_model4_predict = np.argmax(lgb_model4.predict(X_test), axis = 1)
model_evaluation(y_test, lgb_model4_predict) # print classification metrics

# Compare Precision/Recall/F1-Score for No SMOTE, 30%, 60%, 100% SMOTE
def calc_metrics(label, predict):
    """Return ``[precision, recall, f1]`` for the positive class (label 1).

    Each metric falls back to 0 when its denominator is 0.
    """
    cf = confusion_matrix(label, predict)
    hits = cf[1][1]
    predicted_positive = hits + cf[0][1]
    actual_positive = hits + cf[1][0]

    precision = 0
    if predicted_positive != 0:
        precision = hits / predicted_positive

    recall = 0
    if actual_positive != 0:
        recall = hits / actual_positive

    f1 = 0
    if (recall + precision) != 0:
        f1 = (2 * recall * precision) / (recall + precision)

    return [precision, recall, f1]

# Grouped bar chart: one group per metric, one bar per SMOTE ratio.
metric_names = ["Precision", "Recall", "F1-Score"]
ratio_names = ["No SMOTE", "30%", "60%", "100%"]

# Predictions come from earlier cells: `pred` is the model trained without
# SMOTE, lgb_model2/3/4_predict are the 30% / 60% / 100% SMOTE models.
scores_no = calc_metrics(y_test, pred)
scores_30 = calc_metrics(y_test, lgb_model2_predict)
scores_60 = calc_metrics(y_test, lgb_model3_predict)
scores_100 = calc_metrics(y_test, lgb_model4_predict)

# One column per ratio, one row per metric.
scores = np.column_stack([scores_no, scores_30, scores_60, scores_100])

x = np.arange(len(metric_names))
width = 0.2
colors = ["#7F7F7F", "#4C72B0", "#55A868", "#C44E52"]

plt.figure(figsize=(11, 6))
for i, (ratio, color) in enumerate(zip(ratio_names, colors)):
    # Shift each ratio's bars so the four bars of a group sit side by side.
    offset = (i - 1.5) * width
    bars = plt.bar(x + offset, scores[:, i], width=width, label=ratio, color=color)
    for bar in bars:
        h = bar.get_height()
        label_x = bar.get_x() + bar.get_width() / 2
        # Print the value just above the bar.
        plt.text(label_x, h + 0.01, f"{h:.2f}", ha="center", va="bottom", fontsize=9)

plt.xticks(x, metric_names)
plt.ylim(0, 1.05)
plt.xlabel("Metric")
plt.ylabel("Score")
plt.title("Performance Comparison by SMOTE Ratio (No SMOTE vs 30% vs 60% vs 100%)")
plt.legend(title="SMOTE Ratio")
plt.tight_layout()
plt.show()

# BLSM (Borderline SMOTE)
from imblearn.over_sampling import BorderlineSMOTE
sm4 = BorderlineSMOTE(random_state = 0, sampling_strategy = 0.3) # Borderline SMOTE with sampling_strategy=0.3
X_train_res4, y_train_res4 = sm4.fit_resample(X_train, y_train.ravel()) # perform the oversampling

# Wrap the resampled training data in a LightGBM Dataset.
lgb_dtrain5 = lgb.Dataset(data = pd.DataFrame(X_train_res4), label = pd.DataFrame(y_train_res4))
# NOTE(review): unlike the plain-SMOTE runs earlier in this file, this config
# sets 'num_leaves' and omits 'n_estimators', so the two results are not
# trained under identical settings - confirm this is intended.
lgb_param5 = {
    'max_depth': 10, # tree depth
    'learning_rate': 0.01, # step size
    'num_leaves': 64,
    'objective': 'multiclass', # objective function
    'seed': 0, # fixed for reproducibility
    # Labels must lie in [0, num_class), so derive it from the largest label.
    # The previous len(set(pd.DataFrame(y))) + 1 counted DataFrame columns,
    # not classes, and only happened to equal 2.
    'num_class': int(np.max(y_train_res4)) + 1,
}
lgb_model5 = lgb.train(params = lgb_param5, train_set = lgb_dtrain5) # train
# predict() returns one score per class; take the highest-scoring class as the label.
lgb_model5_predict = np.argmax(lgb_model5.predict(X_test), axis = 1)
model_evaluation(y_test, lgb_model5_predict) # print classification metrics

# Training set used below: X_train_res / y_train_res, i.e. the plain SMOTE
# (sampling_strategy=0.3) resampled data - not the Borderline-SMOTE data.
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(C = 1e+10, random_state = 0)
# scikit-learn's LogisticRegression applies ridge (L2) regularization by default,
# so a very large C is used to suppress the penalty (C: inverse of regularization strength)
lr_model.fit(X_train_res, y_train_res) # fit the logistic regression model
lr_predict = lr_model.predict(X_test) # predict the held-out test data
model_evaluation(y_test, lr_predict) # print classification metrics

# Random Forest trained on the SMOTE (sampling_strategy=0.3) resampled data
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(
    n_estimators = 50, # number of trees
    max_depth = 10, # maximum tree depth
    random_state = 0  # fixed seed
)
rf_model = random_forest_model.fit(X_train_res, y_train_res) # train
rf_predict = rf_model.predict(X_test) # predict the held-out test data
model_evaluation(y_test, rf_predict) # print classification metrics

# AdaBoost trained on the SMOTE (sampling_strategy=0.3) resampled data
ada_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=2, random_state=0), # weak learner
    learning_rate=0.01, # learning rate
    n_estimators=50, # number of estimators
    random_state=0 # fixed seed
)
ada_model.fit(X_train_res, y_train_res) # train
ada_predict = ada_model.predict(X_test) # predict the held-out test data
model_evaluation(y_test, ada_predict) # print classification metrics

# CatBoost trained on the SMOTE (sampling_strategy=0.3) resampled data
import catboost as cb
cb_dtrain = cb.Pool(data = X_train_res, label = y_train_res) # wrap the training data in a CatBoost Pool
cb_param = {
    'max_depth': 10, # tree depth
    'learning_rate': 0.01, # step size
    'n_estimators': 50, # number of trees
    'eval_metric': 'Accuracy', # evaluation metric
    'loss_function': 'MultiClass', # loss / objective function
    'random_seed': 0, # fixed for reproducibility
    'verbose': False # silence training logs
}
cb_model = cb.train(pool = cb_dtrain, params = cb_param) # train
cb_model_predict = np.argmax(cb_model.predict(X_test), axis = 1) # highest-scoring class index per test row; no index offset is applied
model_evaluation(y_test, cb_model_predict) # print classification metrics

def compute_metrics(label, predict):
    """Return ``[accuracy, precision, recall, f1]`` for the positive class (label 1).

    Precision, recall and F1 fall back to 0 when their denominator is 0.
    """
    cf = confusion_matrix(label, predict)
    true_neg, false_pos = cf[0][0], cf[0][1]
    false_neg, true_pos = cf[1][0], cf[1][1]

    accuracy = (true_neg + true_pos) / cf.sum()

    predicted_positive = true_pos + false_pos
    precision = true_pos / predicted_positive if predicted_positive != 0 else 0

    actual_positive = true_pos + false_neg
    recall = true_pos / actual_positive if actual_positive != 0 else 0

    denominator = precision + recall
    f1 = (2 * precision * recall) / denominator if denominator != 0 else 0

    return [accuracy, precision, recall, f1]

# Grouped bar chart of the four metrics for every model.
# Every prediction listed here (including ada_predict) must already exist.
model_names = ["Logistic Regression", "RandomForest", "AdaBoost", "CatBoost", "LightGBM"]
model_preds = [lr_predict, rf_predict, ada_predict, cb_model_predict, lgb_model2_predict]
metric_names = ["Accuracy", "Precision", "Recall", "F1-Score"]

# One row per model, one column per metric.
score_rows = []
for model_pred in model_preds:
    score_rows.append(compute_metrics(y_test, model_pred))
scores = np.array(score_rows)

x = np.arange(len(model_names))
width = 0.18

plt.figure(figsize=(13, 6))
for i, (metric, metric_scores) in enumerate(zip(metric_names, scores.T)):
    # Shift each metric's bars so the four bars of a model sit side by side.
    offset = (i - 1.5) * width
    bars = plt.bar(x + offset, metric_scores, width=width, label=metric)
    for bar in bars:
        h = bar.get_height()
        label_x = bar.get_x() + bar.get_width() / 2
        plt.text(label_x, h + 0.01, f"{h:.2f}", ha="center", va="bottom", fontsize=8)

plt.xticks(x, model_names, rotation=10)
plt.ylim(0, 1.08)
plt.xlabel("Model")
plt.ylabel("Score")
plt.title("Model Performance Comparison")
plt.legend()
plt.tight_layout()
plt.show()

# Bagging: train several random forests, each on a bootstrap sample (drawn with
# replacement, same size as the training set) of the SMOTE-resampled data.
bagging_predict_result = [] # one prediction array per bagged model
number_of_bagging = 5 # number of bagging rounds
np.random.seed(0) # fixed for reproducibility

# Loop-invariant work is done once. Passing the population size to
# np.random.choice draws the same kind of index sample without first building a
# Python list of every row index on each round, and the arrays are indexed
# directly instead of being wrapped in a new DataFrame per round.
train_features = np.asarray(X_train_res)
train_labels = np.asarray(y_train_res).ravel() # 1-D labels
n_train = train_features.shape[0]

for idx in range(number_of_bagging):
    random_data_index = np.random.choice(n_train, n_train)
    random_forest_model2 = RandomForestClassifier(
        n_estimators=50, # number of trees
        max_depth=10, # maximum tree depth
        random_state=0, # fixed seed
        verbose=0  # 0: silent, >= 1: print training logs
    )
    # Train on the bootstrap sample.
    rf_model2 = random_forest_model2.fit(
        X = train_features[random_data_index],
        y = train_labels[random_data_index]
    )
    rf_predict2 = rf_model2.predict(X_test) # predict the held-out test data
    bagging_predict_result.append(rf_predict2) # keep this round's predictions for the vote
    print(f"\n{idx + 1} Model Evaluation Result:") # per-round performance
    model_evaluation(y_test, rf_predict2) # print classification metrics

# Majority vote across the bagged models. Labels are 0/1, so the per-sample
# mean of the predictions is the fraction of models voting 1; a mean of at
# least 0.5 yields 1, anything lower yields 0. This replaces the nested Python
# loops over every test row and every model with a single vectorized pass.
vote_share = np.mean(np.asarray(bagging_predict_result), axis = 0)
bagging_predict = (vote_share >= 0.5).astype(int).tolist() # plain list of 0/1, one per test row
model_evaluation(y_test, bagging_predict) # print classification metrics

# F1-score comparison:
def f1_from_pred(label, pred):
    """Return the F1-score of the positive class (label 1).

    Precision, recall and the final score each fall back to 0 when their
    denominator is 0.
    """
    cf = confusion_matrix(label, pred)
    true_pos = cf[1][1]
    predicted_positive = true_pos + cf[0][1]
    actual_positive = true_pos + cf[1][0]

    precision = true_pos / predicted_positive if predicted_positive != 0 else 0
    recall = true_pos / actual_positive if actual_positive != 0 else 0

    if (precision + recall) != 0:
        return (2 * precision * recall) / (precision + recall)
    return 0

# Bar chart of the F1-score of every model, including the bagged ensemble.
# Every prediction listed here (including ada_predict) must already exist.
model_names = [
    "Logistic Regression",
    "RandomForest",
    "AdaBoost",
    "CatBoost",
    "LightGBM",
    "Bagging",
]
preds = [lr_predict, rf_predict, ada_predict, cb_model_predict, lgb_model2_predict, bagging_predict]

f1_scores = []
for model_pred in preds:
    f1_scores.append(f1_from_pred(y_test, model_pred))

bar_colors = ["#4C72B0", "#55A868", "#DD8452", "#C44E52", "#8172B3", "#CCB974"]

plt.figure(figsize=(11, 5))
bars = plt.bar(model_names, f1_scores, color=bar_colors)
plt.ylim(0, 1.0)
plt.xlabel("Model")
plt.ylabel("F1-Score")
plt.title("F1-Score Comparison Across Models")
plt.xticks(rotation=15)

# Write each score just above its bar.
for bar, score in zip(bars, f1_scores):
    bar_center = bar.get_x() + bar.get_width() / 2
    plt.text(bar_center, score + 0.015, f"{score:.2f}", ha="center")

plt.tight_layout()
plt.show()

실습 전 환경 준비 (가상환경 + 커널 등록)¶

1) 가상환경 생성 및 활성화 (OS별)¶

Windows (PowerShell)¶

Linux (bash)¶

macOS (zsh/bash)¶

2) 실습 패키지 설치¶

3) ipykernel 등록¶

실습 데이터 - Credit Card Fraud Detection Data¶

불필요한 데이터 제거¶

데이터 구조 확인¶

데이터 분포 형태 확인¶

데이터 기본 통계 확인 (Summary)¶

collections 모듈의 Counter 클래스를 이용해서 클래스 비율 확인¶

EDA (Exploratory Data Analysis)¶

각 변수별 특성 시각화 및 분석¶

차트 해석 방법 (Box Plot)¶

타겟 변수(Class)별로 각 변수의 분포를 그려 보면, 클래스 간 분포에 차이가 나타나는 변수들을 다음과 같이 정리할 수 있다.¶

데이터 구조 재확인¶

학습데이터와 평가 데이터 분리¶

모델 성능평가 함수 작성¶

LightGBM을 기본 모델로 설정¶

LightGBM 데이터셋 객체 생성¶

LightGBM 파라미터 설정¶

LightGBM 학습 및 평가¶

Oversampling을 수행¶

SMOTE 이용한 Over Sampling 수행¶

동일한 코드를 이용하여 LightGBM 다시 수행 후 성능향상 확인¶

데이터셋 객체 생성¶

학습 및 평가 수행¶

Over sampling이 통한다면...¶

그렇다면, Oversampling 크기를 증가시킨다면? (30% $\to$ 60%)¶

60% 증강된 데이터를 적용한 객체 생성¶

학습 수행¶

60% 증강 성능 분석¶

극단적인 경우 테스트 (100%)¶

100% 증강된 데이터를 적용한 객체 생성¶

학습 수행¶

100% 증강 성능 분석¶

30%, 60%, 100% 증강에 따른 성능 비교¶

다양한 실험 결과 $\to$ 결론: 30%가 가장 적합¶

BLSMOTE 보다 기본 SMOTE가 성능이 좋다.¶

이 경험적 지식을 바탕으로 다양한 모델에 적용해본다¶

다양한 모델의 성능 비교¶

Bagging 적용¶

Bagging을 바탕으로 예측한 결과값에 대해 다수결로 예측¶