Practice Problem 🪄
* Practice topic: classify the load type by examining energy-usage patterns
* Dataset: Steel Industry Energy Consumption
* Required contents
  - Preprocessing (check missing values, duplicates, and outliers, and handle any features that need it)
  - Correlation analysis and visualization
  - Correlation significance testing
  - Scale the data two ways: standardization (StandardScaler) and min-max normalization (MinMaxScaler)
  - Hyperparameter tuning for all machine-learning classifiers and the multilayer perceptron
  - Classification performance evaluation (all evaluation metrics)
  - Confusion matrix and confusion-matrix plot
  - Select the best model for each scaling method
  - Select the overall best model
  - Models to use: all machine-learning classifiers, multilayer perceptron, DNN, plus anything else you like
                 (no hyperparameter tuning for the DNN)
* How to write the code
1. Write and run the whole code without functions
2. Write and run the code organized into one function per feature
  - Entry-point function name: main()

* Jupyter notebook filename
  - 본인이름_에너지_부하타입_분류하기_실습.ipynb
Libraries 🪄
"""데이터 처리"""
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
### 한글처리
plt.rc("font", family="Malgun Gothic")
### 마이너스
plt.rcParams["axes.unicode_minus"] = False
"""스피어만 상관관계 검정"""
from scipy.stats import spearmanr
"""피어슨 상관관계 검정"""
from scipy.stats import pearsonr
"""데이터 분류"""
from sklearn.model_selection import train_test_split
"""하이퍼파라메터 튜닝"""
from sklearn.model_selection import GridSearchCV
"""사용할 분류모델들"""
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
"""HistGradientBoostingClassifier 에 대한 특성중요도 추출 라이브러리"""
from sklearn.inspection import permutation_importance
"""데이터 스케일링 : 정규화 및 표준화"""
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
"""평가"""
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
"""오차행렬(혼동행렬)"""
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
"""딥러닝"""
import tensorflow as tf
from tensorflow import keras
"""다층 퍼셉트론 모델"""
from sklearn.neural_network import MLPClassifier
"""실행 결과를 동일하게 하기 위한 처리(완전 동일하지 않을 수도 있음)"""
tf.keras.utils.set_random_seed(42)
"""연산 고정"""
tf.config.experimental.enable_op_determinism()
Running the Practice Code 🪄
Functionized version: the code is written and run with one function per feature
""" 데이터를 불러오는 함수 """
def load_data(file_path):
    data = pd.read_csv(file_path)
    ### WeekStatus 형변환
    data=data.replace({"Weekday":0, "Weekend":1})
    ### Day_of_week 형변환
    data=data.replace({"Monday":0, "Tuesday":1, "Wednesday":2, "Thursday":3, "Friday":4, "Saturday":5, "Sunday":6})
    ### Load_Type 형변환
    data=data.replace({"Light_Load":0, "Medium_Load":1, "Maximum_Load":2})
    
    ### date를 각 column별로 쪼개기
    data["day"] = pd.to_numeric(data["date"].str.slice(start=0, stop=2))
    data["month"] = pd.to_numeric(data["date"].str.slice(start=3, stop=5))
    data["year"] =  pd.to_numeric(data["date"].str.slice(start=6, stop=10))
    data["time"] = pd.to_numeric(data["date"].str.slice(start=10, stop=13))*60 + pd.to_numeric(data["date"].str.slice(start=14))
    ### date컬럼 삭제
    data.drop("date", axis=1, inplace=True)    
    return data
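For reference, the date decomposition above could also be written with pd.to_datetime; this is only a sketch, assuming the "dd/mm/yyyy HH:MM" format that the string slicing implies:

### Sketch: equivalent decomposition with pd.to_datetime
### (assumes the "dd/mm/yyyy HH:MM" format implied by the slicing above)
dt = pd.to_datetime(data["date"], format="%d/%m/%Y %H:%M")
data["day"] = dt.dt.day
data["month"] = dt.dt.month
data["year"] = dt.dt.year
data["time"] = dt.dt.hour * 60 + dt.dt.minute  # minutes since midnight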
""" 데이터 전처리 함수 """
def check_data(data):
    ### 결측치 확인 → 없음
    info = data.info()
    ### 중복 데이터 확인 → 없음
    dup = data.duplicated()
    dup = data[dup]
    ### 이상치 확인
    des = data.describe()
    
    ### boxplot 분석
    plt.figure(figsize=(15, 5))
    data.boxplot()
    plt.show()
    
    data2= data.drop("NSM", axis=1)
    
    plt.figure(figsize=(15, 5))
    data2.boxplot()
    plt.show()
    
    print(f"결측치 : {info} \n 중복값 : {dup} \n 이상치 : {des} \n 모두 이상 없어 보임!!")
    print("boxplot 분석 결과 이상치로 의심되는 데이터이 보이지만 \n 그 양이 많고 촘촘한 편이므로 평균값과 차이가 있을 뿐 \n 이상치라고 판단하기는 어려워보인다.")
"""상관관계 분석 및 시각화"""
def analyze_and_visualize(data):
    correlation_matrix = data.corr()
    ### 상관관계 시각화1 - SIEC 히트맵
    plt.figure(figsize=(10,6))
    plt.title("SIEC 히트맵")
    sns.heatmap(correlation_matrix, annot = True, fmt=".3f", linewidths=0.5 ,cmap="coolwarm")
    plt.show()
    print("year 데이터는 상관관계가 보이지 않으므로 제거해도 될꺼 같아보인다")
    data.drop("year", axis=1, inplace=True)
    ## 상관관계 시각화2 - 산점행렬도
    sns.pairplot(data)
    plt.suptitle("산점행렬도", y=1)
    plt.show()
"""독립변수, 종속변수 분리"""
def split_independent_dependent(data):
    X = data[[col for col in data.columns if col != "Load_Type"]]
    y = data["Load_Type"]
    return X, y
"""상관관계 검증"""
def test_correlation(data, X, y):
    """스피어만 상관관계 검정 - 비선형적, 순서형 데이터"""
    for column in X.columns :
        corr, p_value = spearmanr(X[column], y)
        print(f"{column} vs Load_Type : corr={corr:.4f} / p_value={p_value:.4f}")
        # 통상적으로 사용되는 유의수준 (0.05)을 기준으로 p-value를 검정
        if p_value < 0.05:
            print("귀무가설을 기각하며, 두 변수 간에는 통계적으로 유의한 상관관계가 존재합니다.")
        else:
            print("귀무가설을 기각할 수 없으며, 두 변수 간에는 통계적으로 유의한 상관관계가 없습니다.")
            
    print("\n스피어만 상관관계 검증을 통해 각 독립변수들의 p-value 값을 확인한 결과 \n각 독립변수와 종속변수의 p-value는 < 0.05 이므로 모두 유의미한 데이터로 생각됩니다.")
"""정규화 + 표준화"""
def compare_scaling_methods(X, y):
    scaling_methods = ["StandardScaler", "MinMaxScaler"]
    df_list = []
    for i in scaling_methods:
        
        print(f"[ {i} ]")
    
            ###정규화
        if i == "StandardScaler":
            ss = StandardScaler()
            scaled_X = ss.fit_transform(X)
            
            """훈련 : 검증 : 테스트 데이터 분류하기 = 6 : 2 : 2"""        
            train_X, temp_X, train_y, temp_y = train_test_split(scaled_X, y, test_size=0.4, random_state=42)
            val_X, test_X, val_y, test_y = train_test_split(temp_X, temp_y, test_size=0.5, random_state=42)
            """머신러닝 분류 모델"""
            df_ml_ss = ml_classify_model(train_X, train_y, val_X, val_y, test_X, test_y)
            """DNN"""
            df_dnn_ss = dl_classify_model(train_X, train_y, val_X, val_y, test_X, test_y)
            df_ml_ss = pd.concat([df_ml_ss, df_dnn_ss], axis=0, ignore_index=True)
            df_list.append(df_ml_ss)
            
            ###표준화  
        else:
            mm = MinMaxScaler()
            scaled_X = mm.fit_transform(X)
            
            """훈련 : 검증 : 테스트 데이터 분류하기 = 6 : 2 : 2"""
            train_X, temp_X, train_y, temp_y = train_test_split(scaled_X, y, test_size=0.4, random_state=42)
            val_X, test_X, val_y, test_y = train_test_split(temp_X, temp_y, test_size=0.5, random_state=42)
            
            """머신러닝 분류 모델"""
            df_ml_mm = ml_classify_model(train_X, train_y, val_X, val_y, test_X, test_y)
            
            """DNN"""
            df_dnn_mm = dl_classify_model(train_X, train_y, val_X, val_y, test_X, test_y)
            df_ml_mm = pd.concat([df_ml_mm, df_dnn_mm], axis=0, ignore_index=True)
            df_list.append(df_ml_mm)
    return df_list, train_X, train_y, val_X, val_y, test_X, test_y
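Since the two branches above differ only in the scaler object, a more compact equivalent (a sketch, not the code that produced the results below) could loop over the scaler instances directly:

### Sketch: compact version that iterates over the scaler objects themselves
def compare_scaling_methods_compact(X, y):
    df_list = []
    for scaler in [StandardScaler(), MinMaxScaler()]:
        print(f"[ {scaler.__class__.__name__} ]")
        scaled_X = scaler.fit_transform(X)
        # Train : validation : test = 6 : 2 : 2
        train_X, temp_X, train_y, temp_y = train_test_split(scaled_X, y, test_size=0.4, random_state=42)
        val_X, test_X, val_y, test_y = train_test_split(temp_X, temp_y, test_size=0.5, random_state=42)
        df_ml = ml_classify_model(train_X, train_y, val_X, val_y, test_X, test_y)
        df_dnn = dl_classify_model(train_X, train_y, val_X, val_y, test_X, test_y)
        df_list.append(pd.concat([df_ml, df_dnn], axis=0, ignore_index=True))
    return df_list, train_X, train_y, val_X, val_y, test_X, test_y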
"""머신러닝 분류모델"""
def ml_classify_model(train_X, train_y, val_X, val_y, test_X, test_y) :
    rf_model = RandomForestClassifier(n_jobs=-1, random_state=42)
    et_model = ExtraTreesClassifier(n_jobs=-1, random_state=42)
    gb_model = GradientBoostingClassifier()
    hg_model = HistGradientBoostingClassifier()
    xgb_model = XGBClassifier(n_jobs=-1, random_state=42)
    mlp_model = MLPClassifier(random_state=42)
    models = [rf_model, et_model, gb_model, hg_model, xgb_model, mlp_model]
    model_name = [ name.__class__.__name__ for name in models]
    df = pd.DataFrame()
    for model, modelName in zip(models, model_name):
        print(f"--------------------------< {modelName} >---------------------")
    
    
        """하이퍼파라메터 매개변수 정의하기-------------------------------------------------"""
        if modelName == "HistGradientBoostingClassifier":
            param_grid = {
                "max_iter"  : [25, 50],
                "max_depth" : [10],
                "min_samples_leaf" : [1, 2, 4]
            }
            
        elif modelName == "XGBClassifier" :
            param_grid = {
                "n_estimators"  : [25, 50],
                "max_depth" : [10],
                "min_child_weight" : [1, 2, 4]
            }
    
        elif modelName == "MLPClassifier" :
            param_grid = {
                "hidden_layer_sizes" : [(10,), (50,), (100,)],
                "alpha" : [0.0001, 0.001, 0.01],
                "max_iter" : [1000]
            }
            
        else :
            param_grid = {
                "n_estimators"  : [25, 50],
                "max_depth" : [10],
                "min_samples_split" : [2, 5],
                "min_samples_leaf" : [1, 2, 4]
            }
    
        """그리드 서치 CV 수행하기 ------------------------------------------------------"""
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring = "accuracy", n_jobs=-1)
        grid_search.fit(train_X, train_y)
    
        
        """최적의 모델 받아오기"""
        model = grid_search.best_estimator_
    
        
        """훈련 및 검증 정확도 확인하기 --------------------------------------------------"""
        
        """정확도를 확인하기 위하여 -> 예측하기 """
        """훈련 및 검증 정확도 확인하기"""
        train_score = model.score(train_X, train_y)
        val_score = model.score(val_X, val_y)
    
        """훈련 및 검증 예측하기"""
        train_pred = model.predict(train_X)
        val_pred = model.predict(val_X)
        
        """정확도 확인하기"""
        train_acc = accuracy_score(train_y, train_pred)
        val_acc = accuracy_score(val_y, val_pred)
    
        
        """최종 테스트 평가하기 -----------------------------------------------------------------------"""
        """ 테스트 데이터로 예측하기"""
        test_pred = model.predict(test_X)
    
        """정확도"""
        test_acc = accuracy_score(test_y, test_pred)
        """정밀도"""
        precision = precision_score(test_y, test_pred, average="micro")
        """재현율"""
        recall = recall_score(test_y, test_pred, average="micro")
        """f1-score"""
        f1 = f1_score(test_y, test_pred, average="micro")
    
        """오차행렬(혼동행렬)"""
        cm = confusion_matrix(test_y, test_pred)
        
        """혼동행렬 시각화"""
        plt.figure(figsize=(8,4))
        plt.title(f"Confusion Matrix - {modelName}")
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=["0","1","2"], yticklabels=["0", "1", "2"])
        plt.savefig(f"./img/confusion_matrix_{modelName}.png")
        # plt.show()
    
        """데이터프레임에 담기--------------------------------------------------------------------------"""
        df_temp = pd.DataFrame({        
            "모델명" : [modelName],
            "훈련정확도" : [train_acc],
            "검증정확도" : [val_acc],
            "테스트정확도" : [test_acc],
            "(훈련-검증)" : [train_acc - val_acc],
            "정밀도" : [precision],
            "재현율" : [recall],
            "f1-score" : [f1],
            "혼동행렬" : [cm]
        })
    
        """하나의 데이터프레임에 추가하기"""
        df = pd.concat([df, df_temp], ignore_index=True)
    return df
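The imports above bring in permutation_importance specifically for HistGradientBoostingClassifier feature importance, but it is never called in this post. A minimal sketch of how it could be applied to the tuned model might look like this (best_hg_model, feature_names, val_X and val_y are assumed to be the tuned estimator, the feature column names, and the validation split from above):

### Sketch: permutation importance for the tuned HistGradientBoostingClassifier
### (best_hg_model and feature_names are assumed/hypothetical names)
result = permutation_importance(best_hg_model, val_X, val_y,
                                n_repeats=5, random_state=42, n_jobs=-1)
for name, importance in sorted(zip(feature_names, result.importances_mean),
                               key=lambda item: -item[1]):
    print(f"{name} : {importance:.4f}")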
"""딥러닝 분류모델 - DNN"""
def dl_classify_model(train_X, train_y, val_X, val_y, test_X, test_y):
    print(f"--------------------------< DNN >---------------------")
    model = keras.Sequential()
    
    """계층 생성 및 모델 설정"""
    model.add(keras.layers.Dense(64, activation="relu", input_dim=12))
    model.add(keras.layers.Dense(32, activation="relu"))
    model.add(keras.layers.Dense(3, activation="softmax"))
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    
    """콜백함수"""
    checkpoint_cb = keras.callbacks.ModelCheckpoint("./model/best_model.h5", save_best_only=True)
    early_stopping_cb = keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)
    
    """모델 훈련"""
    model.fit(train_X, train_y, epochs=100, validation_data=(val_X, val_y), callbacks=[checkpoint_cb, early_stopping_cb])
    
    """최종 테스트 평가하기 -----------------------------------------------------------------------"""
    """ 예측하기 """
    y_pred = model.predict(test_X)
    y_pred = np.argmax(y_pred, axis=1)
    """정확도"""
    test_acc = accuracy_score(test_y, y_pred)
    
    """정밀도"""
    pre = precision_score(test_y, y_pred, average='micro')
    
    """재현율"""
    rec = recall_score(test_y, y_pred, average='micro')
    
    """f1-score"""
    f1 = f1_score(test_y, y_pred, average='micro')
    
    """confusion_matrix"""
    conf = confusion_matrix(test_y, y_pred)
    
    
    df_DNN = pd.DataFrame({        
            "모델명" : "DNN",
            "훈련정확도" : [None],
            "검증정확도" : [None],
            "테스트정확도" : [test_acc],
            "(훈련-검증)" : [None],
            "정밀도" : [pre],
            "재현율" : [rec],
            "f1-score" : [f1],
            "혼동행렬" : [conf]
        })
    """혼동행렬도"""
    sns.heatmap(conf, annot=True, fmt="d", cmap="Blues")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig("./img/confusion_matrix.png")
    plt.show()
    return df_DNN
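ConfusionMatrixDisplay is imported above but the confusion matrices are drawn with sns.heatmap; for reference, a sketch of the same plot using the scikit-learn display object (reusing the conf matrix computed in the function) could be:

### Sketch: same plot with the imported-but-unused ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay(confusion_matrix=conf, display_labels=["0", "1", "2"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - DNN")
plt.show()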
'"""메인 실행함수"""
def main():
    # 데이터 파일 경로
    file_path = './data/Steel_industry_data.csv'
    
    print("\n[데이터 불러오기]\n")
    data = load_data(file_path)
    print(data)
    
    print("\n[데이터 전처리]\n")
    check_data(data)
    
    print("\n[상관관계 분석 및 시각화]\n")
    analyze_and_visualize(data)
    """\n[독립변수 종속변수 분리]\n"""
    X, y = split_independent_dependent(data)
    
    print("\n[상관관계 검증]\n")
    test_correlation(data, X, y)
    print("\n[정규화 + 표준화]\n")
    df_list , train_X, train_y, val_X, val_y, test_X, test_y = compare_scaling_methods(X, y)
    print("\n[결과 데이터프레임 리스트]\n")
    for idx, df in enumerate(df_list, 1):
        print(f"DataFrame {idx}:\n")
        print(df)
        print("\n")
    
    print("비교 결과 StandardScaler로 정규화 시켰을 때 모델은 정확도가 제일 높으면서 일반화가 잘 되어있는 HistGradientBoostingClassifier 분류모델이 좋을것 같고 MinMaxScaler로 표준화 시켰을 때 모델 또한 HistGradientBoostingClassifier이 좋아보인다.")
    
    
if __name__ == "__main__":
    main()
Execution Results
[Load data] 
       Usage_kWh  Lagging_Current_Reactive.Power_kVarh  \ 
0           3.17                                  2.95    
1           4.00                                  4.46    
2           3.24                                  3.28    
3           3.31                                  3.56    
4           3.82                                  4.50    
...          ...                                   ...    
35035       3.85                                  4.86    
35036       3.74                                  3.74    
35037       3.78                                  3.17    
35038       3.78                                  3.06    
35039       3.67                                  3.02    
       Leading_Current_Reactive_Power_kVarh  CO2(tCO2)  \ 
0                                      0.00        0.0    
1                                      0.00        0.0    
2                                      0.00        0.0    
3                                      0.00        0.0    
4                                      0.00        0.0    
...                                     ...        ...    
35035                                  0.00        0.0    
35036                                  0.00        0.0    
35037                                  0.07        0.0    
35038                                  0.11        0.0    
35039                                  0.07        0.0    
       Lagging_Current_Power_Factor  Leading_Current_Power_Factor    NSM  \ 
0                             73.21                        100.00    900    
1                             66.77                        100.00   1800    
2                             70.28                        100.00   2700    
3                             68.09                        100.00   3600    
4                             64.72                        100.00   4500    
...                             ...                           ...    ...    
35035                         62.10                        100.00  82800    
35036                         70.71                        100.00  83700    
35037                         76.62                         99.98  84600    
35038                         77.72                         99.96  85500    
35039                         77.22                         99.98      0    
       WeekStatus  Day_of_week  Load_Type  day  month  year  time   
0               0            0          0    1      1  2018    15   
1               0            0          0    1      1  2018    30   
2               0            0          0    1      1  2018    45   
3               0            0          0    1      1  2018    60   
4               0            0          0    1      1  2018    75   
...           ...          ...        ...  ...    ...   ...   ...   
35035           0            0          0   31     12  2018  1380   
35036           0            0          0   31     12  2018  1395   
35037           0            0          0   31     12  2018  1410   
35038           0            0          0   31     12  2018  1425   
35039           0            0          0   31     12  2018     0   
[35040 rows x 14 columns] 
[Preprocess data] 
<class 'pandas.core.frame.DataFrame'> 
RangeIndex: 35040 entries, 0 to 35039 
Data columns (total 14 columns): 
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   Usage_kWh                             35040 non-null  float64 
 1   Lagging_Current_Reactive.Power_kVarh  35040 non-null  float64 
 2   Leading_Current_Reactive_Power_kVarh  35040 non-null  float64 
 3   CO2(tCO2)                             35040 non-null  float64 
 4   Lagging_Current_Power_Factor          35040 non-null  float64 
 5   Leading_Current_Power_Factor          35040 non-null  float64 
 6   NSM                                   35040 non-null  int64   
 7   WeekStatus                            35040 non-null  int64   
 8   Day_of_week                           35040 non-null  int64   
 9   Load_Type                             35040 non-null  int64   
 10  day                                   35040 non-null  int64   
 11  month                                 35040 non-null  int64   
 12  year                                  35040 non-null  int64   
 13  time                                  35040 non-null  int64   
dtypes: float64(6), int64(8) 
memory usage: 3.7 MB


Missing values : None  
 Duplicates : Empty DataFrame 
Columns: [Usage_kWh, Lagging_Current_Reactive.Power_kVarh, Leading_Current_Reactive_Power_kVarh, CO2(tCO2), Lagging_Current_Power_Factor, Leading_Current_Power_Factor, NSM, WeekStatus, Day_of_week, Load_Type, day, month, year, time] 
Index: []  
 Outliers :           Usage_kWh  Lagging_Current_Reactive.Power_kVarh  \ 
count  35040.000000                          35040.000000    
mean      27.386892                             13.035384    
std       33.444380                             16.306000    
min        0.000000                              0.000000    
25%        3.200000                              2.300000    
50%        4.570000                              5.000000    
75%       51.237500                             22.640000    
max      157.180000                             96.910000    
       Leading_Current_Reactive_Power_kVarh     CO2(tCO2)  \ 
count                          35040.000000  35040.000000    
mean                               3.870949      0.011524    
std                                7.424463      0.016151    
min                                0.000000      0.000000    
25%                                0.000000      0.000000    
50%                                0.000000      0.000000    
75%                                2.090000      0.020000    
max                               27.760000      0.070000    
       Lagging_Current_Power_Factor  Leading_Current_Power_Factor  \ 
count                  35040.000000                  35040.000000    
mean                      80.578056                     84.367870    
std                       18.921322                     30.456535    
min                        0.000000                      0.000000    
25%                       63.320000                     99.700000    
50%                       87.960000                    100.000000    
75%                       99.022500                    100.000000    
max                      100.000000                    100.000000    
                NSM    WeekStatus   Day_of_week     Load_Type           day  \ 
count  35040.000000  35040.000000  35040.000000  35040.000000  35040.000000    
mean   42750.000000      0.284932      2.991781      0.691781     15.720548    
std    24940.534317      0.451388      2.003433      0.792658      8.796373    
min        0.000000      0.000000      0.000000      0.000000      1.000000    
25%    21375.000000      0.000000      1.000000      0.000000      8.000000    
50%    42750.000000      0.000000      3.000000      0.000000     16.000000    
75%    64125.000000      1.000000      5.000000      1.000000     23.000000    
max    85500.000000      1.000000      6.000000      2.000000     31.000000    
              month     year          time   
count  35040.000000  35040.0  35040.000000   
mean       6.526027   2018.0    712.500000   
std        3.447901      0.0    415.675572   
min        1.000000   2018.0      0.000000   
25%        4.000000   2018.0    356.250000   
50%        7.000000   2018.0    712.500000   
75%       10.000000   2018.0   1068.750000   
max       12.000000   2018.0   1425.000000    
 Everything looks fine!! 
The boxplots show points that look like outliers,  
 but they are numerous and densely packed, so they simply differ from the mean  
 and are hard to call true outliers. 
[Correlation analysis and visualization] 

The year column shows no correlation with the other variables, so it looks safe to drop. 
[Correlation significance test] 
Usage_kWh vs Load_Type : corr=0.6964 / p_value=0.0000 
The null hypothesis is rejected: there is a statistically significant correlation between the two variables. 
Lagging_Current_Reactive.Power_kVarh vs Load_Type : corr=0.3602 / p_value=0.0000 
The null hypothesis is rejected: there is a statistically significant correlation between the two variables. 
Leading_Current_Reactive_Power_kVarh vs Load_Type : corr=0.0835 / p_value=0.0000 
The null hypothesis is rejected: there is a statistically significant correlation between the two variables. 
CO2(tCO2) vs Load_Type : corr=0.6656 / p_value=0.0000 
The null hypothesis is rejected: there is a statistically significant correlation between the two variables. 
Lagging_Current_Power_Factor vs Load_Type : corr=0.4548 / p_value=0.0000 
The null hypothesis is rejected: there is a statistically significant correlation between the two variables. 
Leading_Current_Power_Factor vs Load_Type : corr=-0.0280 / p_value=0.0000 
The null hypothesis is rejected: there is a statistically significant correlation between the two variables. 
NSM vs Load_Type : corr=0.5255 / p_value=0.0000 
The null hypothesis is rejected: there is a statistically significant correlation between the two variables. 
WeekStatus vs Load_Type : corr=-0.2106 / p_value=0.0000 
The null hypothesis is rejected: there is a statistically significant correlation between the two variables. 
Day_of_week vs Load_Type : corr=-0.1778 / p_value=0.0000 
The null hypothesis is rejected: there is a statistically significant correlation between the two variables. 
day vs Load_Type : corr=0.0149 / p_value=0.0054 
The null hypothesis is rejected: there is a statistically significant correlation between the two variables. 
month vs Load_Type : corr=-0.0137 / p_value=0.0104 
The null hypothesis is rejected: there is a statistically significant correlation between the two variables. 
time vs Load_Type : corr=0.5255 / p_value=0.0000 
The null hypothesis is rejected: there is a statistically significant correlation between the two variables. 
Checking the p-values of each independent variable with the Spearman test,  
every feature has p-value < 0.05 against the target, so all of them appear meaningful. 
[Standardization + normalization] 
[ StandardScaler ] 
--------------------------< RandomForestClassifier >--------------------- 
--------------------------< ExtraTreesClassifier >--------------------- 
--------------------------< GradientBoostingClassifier >--------------------- 
--------------------------< HistGradientBoostingClassifier >--------------------- 
--------------------------< XGBClassifier >--------------------- 
--------------------------< MLPClassifier >--------------------- 
Epoch 1/100 
657/657 [==============================] - 3s 3ms/step - loss: 0.4841 - accuracy: 0.7754 - val_loss: 0.3974 - val_accuracy: 0.8122 
Epoch 2/100 
657/657 [==============================] - 2s 3ms/step - loss: 0.3592 - accuracy: 0.8372 - val_loss: 0.3263 - val_accuracy: 0.8557 
Epoch 3/100 
657/657 [==============================] - 2s 3ms/step - loss: 0.3005 - accuracy: 0.8633 - val_loss: 0.2811 - val_accuracy: 0.8760 
Epoch 4/100 
657/657 [==============================] - 2s 3ms/step - loss: 0.2646 - accuracy: 0.8819 - val_loss: 0.2754 - val_accuracy: 0.8723 
Epoch 5/100 
657/657 [==============================] - 2s 4ms/step - loss: 0.2396 - accuracy: 0.8964 - val_loss: 0.2424 - val_accuracy: 0.8945 
Epoch 6/100 
657/657 [==============================] - 2s 3ms/step - loss: 0.2190 - accuracy: 0.9084 - val_loss: 0.2232 - val_accuracy: 0.9004 
Epoch 7/100 
657/657 [==============================] - 3s 4ms/step - loss: 0.2044 - accuracy: 0.9152 - val_loss: 0.2123 - val_accuracy: 0.9081 
Epoch 8/100 
657/657 [==============================] - 2s 4ms/step - loss: 0.1948 - accuracy: 0.9187 - val_loss: 0.1941 - val_accuracy: 0.9208 
Epoch 9/100 
657/657 [==============================] - 3s 4ms/step - loss: 0.1835 - accuracy: 0.9250 - val_loss: 0.1789 - val_accuracy: 0.9298 
Epoch 10/100 
657/657 [==============================] - 2s 3ms/step - loss: 0.1731 - accuracy: 0.9287 - val_loss: 0.1792 - val_accuracy: 0.9247 
Epoch 11/100 
657/657 [==============================] - 2s 3ms/step - loss: 0.1683 - accuracy: 0.9320 - val_loss: 0.1749 - val_accuracy: 0.9248 
Epoch 12/100 
657/657 [==============================] - 2s 3ms/step - loss: 0.1594 - accuracy: 0.9364 - val_loss: 0.1739 - val_accuracy: 0.9269 
Epoch 13/100 
657/657 [==============================] - 1s 2ms/step - loss: 0.1541 - accuracy: 0.9392 - val_loss: 0.1639 - val_accuracy: 0.9334 
Epoch 14/100 
657/657 [==============================] - 1s 2ms/step - loss: 0.1481 - accuracy: 0.9404 - val_loss: 0.1499 - val_accuracy: 0.9408 
Epoch 15/100 
657/657 [==============================] - 1s 2ms/step - loss: 0.1427 - accuracy: 0.9436 - val_loss: 0.1545 - val_accuracy: 0.9369 
Epoch 16/100 
657/657 [==============================] - 1s 2ms/step - loss: 0.1397 - accuracy: 0.9443 - val_loss: 0.1671 - val_accuracy: 0.9274






[ MinMaxScaler ] 
--------------------------< RandomForestClassifier >--------------------- 
--------------------------< ExtraTreesClassifier >--------------------- 
--------------------------< GradientBoostingClassifier >--------------------- 
--------------------------< HistGradientBoostingClassifier >--------------------- 
--------------------------< XGBClassifier >--------------------- 
--------------------------< MLPClassifier >--------------------- 
Epoch 1/100 
657/657 [==============================] - 1s 863us/step - loss: 0.5828 - accuracy: 0.7333 - val_loss: 0.4731 - val_accuracy: 0.7928 
Epoch 2/100 
657/657 [==============================] - 1s 796us/step - loss: 0.4549 - accuracy: 0.7887 - val_loss: 0.4384 - val_accuracy: 0.8034 
Epoch 3/100 
657/657 [==============================] - 1s 793us/step - loss: 0.4217 - accuracy: 0.8027 - val_loss: 0.4000 - val_accuracy: 0.8212 
Epoch 4/100 
657/657 [==============================] - 0s 724us/step - loss: 0.3906 - accuracy: 0.8232 - val_loss: 0.4063 - val_accuracy: 0.8225 
Epoch 5/100 
657/657 [==============================] - 1s 783us/step - loss: 0.3612 - accuracy: 0.8397 - val_loss: 0.3526 - val_accuracy: 0.8398 
Epoch 6/100 
657/657 [==============================] - 1s 786us/step - loss: 0.3343 - accuracy: 0.8473 - val_loss: 0.3271 - val_accuracy: 0.8527 
Epoch 7/100 
657/657 [==============================] - 1s 789us/step - loss: 0.3153 - accuracy: 0.8543 - val_loss: 0.3262 - val_accuracy: 0.8522 
Epoch 8/100 
657/657 [==============================] - 1s 766us/step - loss: 0.2991 - accuracy: 0.8625 - val_loss: 0.2911 - val_accuracy: 0.8663 
Epoch 9/100 
657/657 [==============================] - 1s 800us/step - loss: 0.2850 - accuracy: 0.8708 - val_loss: 0.2757 - val_accuracy: 0.8783 
Epoch 10/100 
657/657 [==============================] - 1s 782us/step - loss: 0.2717 - accuracy: 0.8760 - val_loss: 0.2729 - val_accuracy: 0.8760 
Epoch 11/100 
657/657 [==============================] - 1s 793us/step - loss: 0.2630 - accuracy: 0.8833 - val_loss: 0.2626 - val_accuracy: 0.8838 
Epoch 12/100 
657/657 [==============================] - 1s 788us/step - loss: 0.2530 - accuracy: 0.8872 - val_loss: 0.2480 - val_accuracy: 0.8843 
Epoch 13/100 
657/657 [==============================] - 1s 803us/step - loss: 0.2466 - accuracy: 0.8897 - val_loss: 0.2472 - val_accuracy: 0.8920 
Epoch 14/100 
657/657 [==============================] - 1s 770us/step - loss: 0.2397 - accuracy: 0.8967 - val_loss: 0.2419 - val_accuracy: 0.8931 
Epoch 15/100 
657/657 [==============================] - 1s 779us/step - loss: 0.2328 - accuracy: 0.9000 - val_loss: 0.2354 - val_accuracy: 0.8968 
Epoch 16/100 
657/657 [==============================] - 1s 768us/step - loss: 0.2303 - accuracy: 0.8991 - val_loss: 0.2483 - val_accuracy: 0.8807 
Epoch 17/100 
657/657 [==============================] - 1s 783us/step - loss: 0.2241 - accuracy: 0.9039 - val_loss: 0.2347 - val_accuracy: 0.8965 
Epoch 18/100 
657/657 [==============================] - 1s 798us/step - loss: 0.2197 - accuracy: 0.9072 - val_loss: 0.2195 - val_accuracy: 0.9055 
Epoch 19/100 
657/657 [==============================] - 1s 759us/step - loss: 0.2178 - accuracy: 0.9065 - val_loss: 0.2483 - val_accuracy: 0.8864 
Epoch 20/100 
657/657 [==============================] - 1s 793us/step - loss: 0.2108 - accuracy: 0.9105 - val_loss: 0.2141 - val_accuracy: 0.9110 
Epoch 21/100 
657/657 [==============================] - 1s 810us/step - loss: 0.2093 - accuracy: 0.9126 - val_loss: 0.2051 - val_accuracy: 0.9142 
Epoch 22/100 
657/657 [==============================] - 0s 755us/step - loss: 0.2045 - accuracy: 0.9167 - val_loss: 0.2224 - val_accuracy: 0.9055 
Epoch 23/100 
657/657 [==============================] - 0s 743us/step - loss: 0.2043 - accuracy: 0.9150 - val_loss: 0.2198 - val_accuracy: 0.9004






[Result DataFrames] 
DataFrame 1: 
                              Model  Train Acc   Val Acc  Test Acc  (Train-Val)  \ 
0          RandomForestClassifier  0.985683  0.975742  0.977312  0.009941    
1            ExtraTreesClassifier  0.907344  0.894264  0.887985  0.013080    
2      GradientBoostingClassifier  1.000000  0.996005  0.996005  0.003995    
3  HistGradientBoostingClassifier  0.999810  0.997860  0.998858  0.001950    
4                   XGBClassifier  1.000000  0.998002  0.998288  0.001998    
5                   MLPClassifier  0.972888  0.961045  0.957049  0.011844    
6                             DNN       NaN       NaN  0.933790       NaN    
   Precision    Recall  f1-score  \ 
0  0.977312  0.977312  0.977312    
1  0.887985  0.887985  0.887985    
2  0.996005  0.996005  0.996005    
3  0.998858  0.998858  0.998858    
4  0.998288  0.998288  0.998288    
5  0.957049  0.957049  0.957049    
6  0.933790  0.933790  0.933790    
                                   Confusion Matrix   
0    [[3572, 32, 7], [58, 1865, 22], [18, 22, 1412]]   
1  [[3532, 76, 3], [133, 1514, 298], [48, 227, 11...   
2         [[3594, 8, 9], [4, 1940, 1], [6, 0, 1446]]   
3         [[3605, 1, 5], [0, 1945, 0], [2, 0, 1450]]   
4         [[3602, 2, 7], [1, 1944, 0], [2, 0, 1450]]   
5   [[3537, 62, 12], [69, 1811, 65], [30, 63, 1359]]   
6  [[3476, 120, 15], [112, 1749, 84], [51, 82, 13...   
DataFrame 2: 
                              Model  Train Acc   Val Acc  Test Acc  (Train-Val)  \ 
0          RandomForestClassifier  0.985683  0.975885  0.977454  0.009798    
1            ExtraTreesClassifier  0.907344  0.894264  0.887985  0.013080    
2      GradientBoostingClassifier  1.000000  0.996147  0.995719  0.003853    
3  HistGradientBoostingClassifier  0.999715  0.998716  0.998573  0.000999    
4                   XGBClassifier  1.000000  0.998002  0.998288  0.001998    
5                   MLPClassifier  0.958524  0.954481  0.947631  0.004043    
6                             DNN       NaN       NaN  0.911387       NaN    
   Precision    Recall  f1-score  \ 
0  0.977454  0.977454  0.977454    
1  0.887985  0.887985  0.887985    
2  0.995719  0.995719  0.995719    
3  0.998573  0.998573  0.998573    
4  0.998288  0.998288  0.998288    
5  0.947631  0.947631  0.947631    
6  0.911387  0.911387  0.911387    
                                   Confusion Matrix   
0    [[3571, 33, 7], [58, 1867, 20], [18, 22, 1412]]   
1  [[3532, 76, 3], [133, 1514, 298], [48, 227, 11...   
2        [[3592, 11, 8], [3, 1941, 1], [7, 0, 1445]]   
3         [[3603, 2, 6], [0, 1945, 0], [2, 0, 1450]]   
4         [[3602, 2, 7], [1, 1944, 0], [2, 0, 1450]]   
5   [[3505, 81, 25], [88, 1759, 98], [29, 46, 1377]]   
6  [[3443, 140, 28], [143, 1648, 154], [35, 121, ...   
Comparing the results, when the data is standardized with StandardScaler, the best model looks like HistGradientBoostingClassifier, which has the highest accuracy while generalizing well; when the data is scaled with MinMaxScaler, HistGradientBoostingClassifier again looks best.