
[Machine Learning + Deep Learning] Hands-on: Classifying Load Type from Energy Usage Patterns

~ Table of Contents ~

Practice Problem 🪄

 


 

* Practice topic: classify load type from energy usage patterns
* Dataset: Steel Industry Energy Consumption

* Required items
  - Preprocessing (check for missing values, duplicates, and outliers, and handle them if any feature needs it)
  - Correlation analysis and visualization
  - Correlation significance testing
  - Scaling with both standardization (StandardScaler) and min-max normalization (MinMaxScaler)
  - Hyperparameter tuning for all machine learning classifiers and the multilayer perceptron
  - Classification performance evaluation (all evaluation metrics)
  - Confusion matrix and confusion matrix plot
  - Select the best model for each scaling method
  - Select the single best model overall
  - Models to use: all machine learning classifiers, multilayer perceptron, DNN, plus anything else you like
                   (no hyperparameter tuning for the DNN)

* How to write the code
1. Write and run the entire code without functions

2. Write and run the code organized into functions by feature
  - Entry-point function name: main()

* Jupyter file name
  - 본인이름_에너지_부하타입_분류하기_실습.ipynb (본인이름 = your name)

Libraries 🪄

 

"""데이터 처리"""
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

### 한글처리
plt.rc("font", family="Malgun Gothic")
### 마이너스
plt.rcParams["axes.unicode_minus"] = False

"""스피어만 상관관계 검정"""
from scipy.stats import spearmanr
"""피어슨 상관관계 검정"""
from scipy.stats import pearsonr

"""데이터 분류"""
from sklearn.model_selection import train_test_split

"""하이퍼파라메터 튜닝"""
from sklearn.model_selection import GridSearchCV

"""사용할 분류모델들"""
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier

"""HistGradientBoostingClassifier 에 대한 특성중요도 추출 라이브러리"""
from sklearn.inspection import permutation_importance

"""데이터 스케일링 : 정규화 및 표준화"""
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

"""평가"""
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

"""오차행렬(혼동행렬)"""
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

"""딥러닝"""
import tensorflow as tf
from tensorflow import keras

"""다층 퍼셉트론 모델"""
from sklearn.neural_network import MLPClassifier

"""실행 결과를 동일하게 하기 위한 처리(완전 동일하지 않을 수도 있음)"""
tf.keras.utils.set_random_seed(42)

"""연산 고정"""
tf.config.experimental.enable_op_determinism()
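The cells below save confusion-matrix figures to ./img and the Keras checkpoint to ./model; plt.savefig and ModelCheckpoint fail if those folders are missing. A small guard one might add right after the imports (a sketch, assuming the notebook runs from the project root):

### Create the output folders used by plt.savefig and ModelCheckpoint below
import os

for folder in ["./img", "./model"]:
    os.makedirs(folder, exist_ok=True)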

 

 

 

Running the Practice Functions 🪄

 

Turning the practice into functions: the code is written and run with one function per feature

 

""" 데이터를 불러오는 함수 """

def load_data(file_path):
    data = pd.read_csv(file_path)
    ### WeekStatus 형변환
    data=data.replace({"Weekday":0, "Weekend":1})
    ### Day_of_week 형변환
    data=data.replace({"Monday":0, "Tuesday":1, "Wednesday":2, "Thursday":3, "Friday":4, "Saturday":5, "Sunday":6})
    ### Load_Type 형변환
    data=data.replace({"Light_Load":0, "Medium_Load":1, "Maximum_Load":2})
    
    ### date를 각 column별로 쪼개기
    data["day"] = pd.to_numeric(data["date"].str.slice(start=0, stop=2))
    data["month"] = pd.to_numeric(data["date"].str.slice(start=3, stop=5))
    data["year"] =  pd.to_numeric(data["date"].str.slice(start=6, stop=10))
    data["time"] = pd.to_numeric(data["date"].str.slice(start=10, stop=13))*60 + pd.to_numeric(data["date"].str.slice(start=14))

    ### date컬럼 삭제
    data.drop("date", axis=1, inplace=True)    
    return data
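The string slicing above is tied to the exact "dd/mm/yyyy HH:MM" layout; an equivalent variant (a sketch assuming that format, not part of the original notebook) can derive the same columns through pd.to_datetime, which also validates the format while parsing:

### Sketch: the same date features via pd.to_datetime (assumes "dd/mm/yyyy HH:MM")
def add_date_features(data):
    dt = pd.to_datetime(data["date"], format="%d/%m/%Y %H:%M")
    data["day"] = dt.dt.day
    data["month"] = dt.dt.month
    data["year"] = dt.dt.year
    data["time"] = dt.dt.hour * 60 + dt.dt.minute   # minutes since midnight
    return data.drop("date", axis=1)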

""" 데이터 전처리 함수 """
def check_data(data):
    ### 결측치 확인 → 없음
    info = data.info()
    ### 중복 데이터 확인 없음
    dup = data.duplicated()
    dup = data[dup]
    ### 이상치 확인
    des = data.describe()
    
    ### boxplot 분석
    plt.figure(figsize=(15, 5))
    data.boxplot()
    plt.show()
    
    data2= data.drop("NSM", axis=1)
    
    plt.figure(figsize=(15, 5))
    data2.boxplot()
    plt.show()
    
    print(f"결측치 : {info} \n 중복값 : {dup} \n 이상치 : {des} \n 모두 이상 없어 보임!!")
    print("boxplot 분석 결과 이상치로 의심되는 데이터이 보이지만 \n 그 양이 많고 촘촘한 편이므로 평균값과 차이가 있을 뿐 \n 이상치라고 판단하기는 어려워보인다.")

 

"""상관관계 분석 및 시각화"""
def analyze_and_visualize(data):

    correlation_matrix = data.corr()

    ### 상관관계 시각화1 - SIEC 히트맵
    plt.figure(figsize=(10,6))
    plt.title("SIEC 히트맵")
    sns.heatmap(correlation_matrix, annot = True, fmt=".3f", linewidths=0.5 ,cmap="coolwarm")
    plt.show()

    print("year 데이터는 상관관계가 보이지 않으므로 제거해도 될꺼 같아보인다")
    data.drop("year", axis=1, inplace=True)

    ## 상관관계 시각화2 - 산점행렬도
    sns.pairplot(data)
    plt.suptitle("산점행렬도", y=1)
    plt.show()

 

"""독립변수, 종속변수 분리"""
def split_independent_dependent(data):
    X = data[[col for col in data.columns if col != "Load_Type"]]
    y = data["Load_Type"]

    return X, y

 

"""상관관계 검증"""
def test_correlation(data, X, y):
    """스피어만 상관관계 검정 - 비선형적, 순서형 데이터"""
    for column in X.columns :
        corr, p_value = spearmanr(X[column], y)
        print(f"{column} vs Load_Type : corr={corr:.4f} / p_value={p_value:.4f}")
        # 통상적으로 사용되는 유의수준 (0.05)을 기준으로 p-value를 검정
        if p_value < 0.05:
            print("귀무가설을 기각하며, 두 변수 간에는 통계적으로 유의한 상관관계가 존재합니다.")
        else:
            print("귀무가설을 기각할 수 없으며, 두 변수 간에는 통계적으로 유의한 상관관계가 없습니다.")
            
    print("\n스피어만 상관관계 검증을 통해 각 독립변수들의 p-value 값을 확인한 결과 \n각 독립변수와 종속변수의 p-value는 < 0.05 이므로 모두 유의미한 데이터로 생각됩니다.")

 

"""정규화 + 표준화"""
def compare_scaling_methods(X, y):

    scaling_methods = ["StandardScaler", "MinMaxScaler"]
    df_list = []

    for i in scaling_methods:
        
        print(f"[ {i} ]")
    
            ###정규화
        if i == "StandardScaler":
            ss = StandardScaler()
            scaled_X = ss.fit_transform(X)
            
            """훈련 : 검증 : 테스트 데이터 분류하기 = 6 : 2 : 2"""        
            train_X, temp_X, train_y, temp_y = train_test_split(scaled_X, y, test_size=0.4, random_state=42)
            val_X, test_X, val_y, test_y = train_test_split(temp_X, temp_y, test_size=0.5, random_state=42)

            """머신러닝 분류 모델"""
            df_ml_ss = ml_classify_model(train_X, train_y, val_X, val_y, test_X, test_y)

            """DNN"""
            df_dnn_ss = dl_classify_model(train_X, train_y, val_X, val_y, test_X, test_y)
            df_ml_ss = pd.concat([df_ml_ss, df_dnn_ss], axis=0, ignore_index=True)

            df_list.append(df_ml_ss)
            
            ###표준화  
        else:
            mm = MinMaxScaler()
            scaled_X = mm.fit_transform(X)
            
            """훈련 : 검증 : 테스트 데이터 분류하기 = 6 : 2 : 2"""
            train_X, temp_X, train_y, temp_y = train_test_split(scaled_X, y, test_size=0.4, random_state=42)
            val_X, test_X, val_y, test_y = train_test_split(temp_X, temp_y, test_size=0.5, random_state=42)
            
            """머신러닝 분류 모델"""
            df_ml_mm = ml_classify_model(train_X, train_y, val_X, val_y, test_X, test_y)
            
            """DNN"""
            df_dnn_mm = dl_classify_model(train_X, train_y, val_X, val_y, test_X, test_y)
            df_ml_mm = pd.concat([df_ml_mm, df_dnn_mm], axis=0, ignore_index=True)
            df_list.append(df_ml_mm)

    return df_list, train_X, train_y, val_X, val_y, test_X, test_y
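The two branches above differ only in which scaler object they build, so the function can be collapsed into one code path; a sketch of an equivalent loop over a scaler dict (same splits, same random_state, same return values):

### Sketch: same behavior as compare_scaling_methods, one code path per scaler
def compare_scaling_methods_compact(X, y):
    scalers = {"StandardScaler": StandardScaler(), "MinMaxScaler": MinMaxScaler()}
    df_list = []

    for name, scaler in scalers.items():
        print(f"[ {name} ]")
        scaled_X = scaler.fit_transform(X)

        ### train : validation : test = 6 : 2 : 2
        train_X, temp_X, train_y, temp_y = train_test_split(scaled_X, y, test_size=0.4, random_state=42)
        val_X, test_X, val_y, test_y = train_test_split(temp_X, temp_y, test_size=0.5, random_state=42)

        df_ml = ml_classify_model(train_X, train_y, val_X, val_y, test_X, test_y)
        df_dnn = dl_classify_model(train_X, train_y, val_X, val_y, test_X, test_y)
        df_list.append(pd.concat([df_ml, df_dnn], axis=0, ignore_index=True))

    return df_list, train_X, train_y, val_X, val_y, test_X, test_y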

 

"""머신러닝 분류모델"""
def ml_classify_model(train_X, train_y, val_X, val_y, test_X, test_y) :
    rf_model = RandomForestClassifier(n_jobs=-1, random_state=42)
    et_model = ExtraTreesClassifier(n_jobs=-1, random_state=42)
    gb_model = GradientBoostingClassifier()
    hg_model = HistGradientBoostingClassifier()
    xgb_model = XGBClassifier(n_jobs=-1, random_state=42)
    mlp_model = MLPClassifier(random_state=42)

    models = [rf_model, et_model, gb_model, hg_model, xgb_model, mlp_model]

    model_name = [ name.__class__.__name__ for name in models]

    df = pd.DataFrame()

    for model, modelName in zip(models, model_name):
        print(f"--------------------------< {modelName} >---------------------")
    
    
        """하이퍼파라메터 매개변수 정의하기-------------------------------------------------"""
        if modelName == "HistGradientBoostingClassifier":
            param_grid = {
                "max_iter"  : [25, 50],
                "max_depth" : [10],
                "min_samples_leaf" : [1, 2, 4]
            }
            
        elif modelName == "XGBClassifier" :
            param_grid = {
                "n_estimators"  : [25, 50],
                "max_depth" : [10],
                "min_child_weight" : [1, 2, 4]
            }
    
        elif modelName == "MLPClassifier" :
            param_grid = {
                "hidden_layer_sizes" : [(10,), (50,), (100,)],
                "alpha" : [0.0001, 0.001, 0.01],
                "max_iter" : [1000]
            }
            
        else :
            param_grid = {
                "n_estimators"  : [25, 50],
                "max_depth" : [10],
                "min_samples_split" : [2, 5],
                "min_samples_leaf" : [1, 2, 4]
            }
    
        """그리드 서치 CV 수행하기 ------------------------------------------------------"""
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring = "accuracy", n_jobs=-1)
        grid_search.fit(train_X, train_y)
    
        
        """최적의 모델 받아오기"""
        model = grid_search.best_estimator_
    
        
        """훈련 및 검증 정확도 확인하기 --------------------------------------------------"""
        
        """정확도를 확인하기 위하여 -> 예측하기 """
        """훈련 및 검증 정확도 확인하기"""
        train_score = model.score(train_X, train_y)
        val_score = model.score(val_X, val_y)
    
        """훈련 및 검증 예측하기"""
        train_pred = model.predict(train_X)
        val_pred = model.predict(val_X)
        
        """정확도 확인하기"""
        train_acc = accuracy_score(train_y, train_pred)
        val_acc = accuracy_score(val_y, val_pred)
    
        
        """최종 테스트 평가하기 -----------------------------------------------------------------------"""
        """ 테스트 데이터로 예측하기"""
        test_pred = model.predict(test_X)
    
        """정확도"""
        test_acc = accuracy_score(test_y, test_pred)
        """정밀도"""
        precision = precision_score(test_y, test_pred, average="micro")
        """재현율"""
        recall = recall_score(test_y, test_pred, average="micro")
        """f1-score"""
        f1 = f1_score(test_y, test_pred, average="micro")
    
        """오차행렬(혼동행렬)"""
        cm = confusion_matrix(test_y, test_pred)
        
        """혼동행렬 시각화"""
        plt.figure(figsize=(8,4))
        plt.title(f"Confusion Matrix - {modelName}")
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=["0","1","2"], yticklabels=["0", "1", "2"])
        plt.savefig(f"./img/confusion_matrix_{modelName}.png")
        # plt.show()
    
        """데이터프레임에 담기--------------------------------------------------------------------------"""
        df_temp = pd.DataFrame({        
            "모델명" : [modelName],
            "훈련정확도" : [train_acc],
            "검증정확도" : [val_acc],
            "테스트정확도" : [test_acc],
            "(훈련-검증)" : [train_acc - val_acc],
            "정밀도" : [precision],
            "재현율" : [recall],
            "f1-score" : [f1],
            "혼동행렬" : [cm]
        })
    
        """하나의 데이터프레임에 추가하기"""
        df = pd.concat([df, df_temp], ignore_index=True)


    return df
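permutation_importance is imported at the top for HistGradientBoostingClassifier (which, unlike the tree ensembles above, exposes no feature_importances_ attribute) but is never actually called. A sketch of how it could be applied to the fitted best estimator from the loop (feature_names is an assumption, since the scaled arrays no longer carry column labels):

### Sketch: permutation importance for a fitted model such as HistGradientBoostingClassifier
def show_permutation_importance(model, val_X, val_y, feature_names):
    result = permutation_importance(model, val_X, val_y, n_repeats=5, random_state=42, n_jobs=-1)
    importances = pd.Series(result.importances_mean, index=feature_names)
    print(importances.sort_values(ascending=False))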

 

"""딥러닝 분류모델 - DNN"""
def dl_classify_model(train_X, train_y, val_X, val_y, test_X, test_y):
    print(f"--------------------------< DNN >---------------------")
    model = keras.Sequential()
    
    """계층 생성 및 모델 설정"""
    model.add(keras.layers.Dense(64, activation="relu", input_dim=12))
    model.add(keras.layers.Dense(32, activation="relu"))
    model.add(keras.layers.Dense(3, activation="softmax"))
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    
    """콜백함수"""
    checkpoint_cb = keras.callbacks.ModelCheckpoint("./model/best_model.h5", save_best_only=True)
    early_stopping_cb = keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)
    
    """모델 훈련"""
    model.fit(train_X, train_y, epochs=100, validation_data=(val_X, val_y), callbacks=[checkpoint_cb, early_stopping_cb])

    
    """최종 테스트 평가하기 -----------------------------------------------------------------------"""
    """ 예측하기 """
    y_pred = model.predict(test_X)
    y_pred = np.argmax(y_pred, axis=1)

    """정확도"""
    test_acc = accuracy_score(test_y, y_pred)
    
    """정밀도"""
    pre = precision_score(test_y, y_pred, average='micro')
    
    """재현율"""
    rec = recall_score(test_y, y_pred, average='micro')
    
    """f1-score"""
    f1 = f1_score(test_y, y_pred, average='micro')
    
    """confusion_matrix"""
    conf = confusion_matrix(test_y, y_pred)
    
    
    df_DNN = pd.DataFrame({        
            "모델명" : "DNN",
            "훈련정확도" : [None],
            "검증정확도" : [None],
            "테스트정확도" : [test_acc],
            "(훈련-검증)" : [None],
            "정밀도" : [pre],
            "재현율" : [rec],
            "f1-score" : [f1],
            "혼동행렬" : [conf]
        })

    """혼동행렬도"""
    sns.heatmap(conf, annot=True, fmt="d", cmap="Blues")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig("./img/confusion_matrix.png")
    plt.show()

    return df_DNN
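ModelCheckpoint writes the best epoch's weights to ./model/best_model.h5, but the file is never read back in this notebook (EarlyStopping with restore_best_weights=True already restores them in memory). For completeness, a sketch of reloading the checkpoint later, e.g. in a fresh session:

### Sketch: reload the checkpointed DNN and re-evaluate it on the test set
def evaluate_saved_dnn(test_X, test_y):
    best_model = keras.models.load_model("./model/best_model.h5")
    loss, acc = best_model.evaluate(test_X, test_y, verbose=0)
    print(f"Restored DNN - test loss: {loss:.4f} / test accuracy: {acc:.4f}")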

 

'"""메인 실행함수"""
def main():
    # 데이터 파일 경로
    file_path = './data/Steel_industry_data.csv'
    
    print("\n[데이터 불러오기]\n")
    data = load_data(file_path)
    print(data)
    
    print("\n[데이터 전처리]\n")
    check_data(data)
    
    print("\n[상관관계 분석 및 시각화]\n")
    analyze_and_visualize(data)

    """\n[독립변수 종속변수 분리]\n"""
    X, y = split_independent_dependent(data)
    
    print("\n[상관관계 검증]\n")
    test_correlation(data, X, y)

    print("\n[정규화 + 표준화]\n")
    df_list , train_X, train_y, val_X, val_y, test_X, test_y = compare_scaling_methods(X, y)
    print("\n[결과 데이터프레임 리스트]\n")
    for idx, df in enumerate(df_list, 1):
        print(f"DataFrame {idx}:\n")
        print(df)
        print("\n")
    
    print("비교 결과 StandardScaler로 정규화 시켰을 때 모델은 정확도가 제일 높으면서 일반화가 잘 되어있는 HistGradientBoostingClassifier 분류모델이 좋을것 같고 MinMaxScaler로 표준화 시켰을 때 모델 또한 HistGradientBoostingClassifier이 좋아보인다.")
    
    

if __name__ == "__main__":
    main()
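The assignment also asks for the best model per scaling method and a single overall winner; a sketch that reads both off df_list programmatically (ranking by test accuracy and breaking ties by the smaller train-validation gap is an assumption about what "best" means):

### Sketch: pick the best model per scaler and overall from the result DataFrames
def pick_best_models(df_list, scaler_names=("StandardScaler", "MinMaxScaler")):
    winners = []
    for name, df in zip(scaler_names, df_list):
        best = df.sort_values(["Test Acc", "(Train-Val)"], ascending=[False, True]).iloc[0]
        winners.append((name, best))
        print(f"{name}: best model = {best['Model']} (test accuracy {best['Test Acc']:.6f})")
    overall_name, overall = max(winners, key=lambda t: t[1]["Test Acc"])
    print(f"Overall winner: {overall['Model']} with {overall_name}")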

 

 

Execution Results

 

[Load the data]

       Usage_kWh  Lagging_Current_Reactive.Power_kVarh  \
0           3.17                                  2.95   
1           4.00                                  4.46   
2           3.24                                  3.28   
3           3.31                                  3.56   
4           3.82                                  4.50   
...          ...                                   ...   
35035       3.85                                  4.86   
35036       3.74                                  3.74   
35037       3.78                                  3.17   
35038       3.78                                  3.06   
35039       3.67                                  3.02   

       Leading_Current_Reactive_Power_kVarh  CO2(tCO2)  \
0                                      0.00        0.0   
1                                      0.00        0.0   
2                                      0.00        0.0   
3                                      0.00        0.0   
4                                      0.00        0.0   
...                                     ...        ...   
35035                                  0.00        0.0   
35036                                  0.00        0.0   
35037                                  0.07        0.0   
35038                                  0.11        0.0   
35039                                  0.07        0.0   

       Lagging_Current_Power_Factor  Leading_Current_Power_Factor    NSM  \
0                             73.21                        100.00    900   
1                             66.77                        100.00   1800   
2                             70.28                        100.00   2700   
3                             68.09                        100.00   3600   
4                             64.72                        100.00   4500   
...                             ...                           ...    ...   
35035                         62.10                        100.00  82800   
35036                         70.71                        100.00  83700   
35037                         76.62                         99.98  84600   
35038                         77.72                         99.96  85500   
35039                         77.22                         99.98      0   

       WeekStatus  Day_of_week  Load_Type  day  month  year  time  
0               0            0          0    1      1  2018    15  
1               0            0          0    1      1  2018    30  
2               0            0          0    1      1  2018    45  
3               0            0          0    1      1  2018    60  
4               0            0          0    1      1  2018    75  
...           ...          ...        ...  ...    ...   ...   ...  
35035           0            0          0   31     12  2018  1380  
35036           0            0          0   31     12  2018  1395  
35037           0            0          0   31     12  2018  1410  
35038           0            0          0   31     12  2018  1425  
35039           0            0          0   31     12  2018     0  

[35040 rows x 14 columns]

[Preprocess the data]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35040 entries, 0 to 35039
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Usage_kWh                             35040 non-null  float64
 1   Lagging_Current_Reactive.Power_kVarh  35040 non-null  float64
 2   Leading_Current_Reactive_Power_kVarh  35040 non-null  float64
 3   CO2(tCO2)                             35040 non-null  float64
 4   Lagging_Current_Power_Factor          35040 non-null  float64
 5   Leading_Current_Power_Factor          35040 non-null  float64
 6   NSM                                   35040 non-null  int64  
 7   WeekStatus                            35040 non-null  int64  
 8   Day_of_week                           35040 non-null  int64  
 9   Load_Type                             35040 non-null  int64  
 10  day                                   35040 non-null  int64  
 11  month                                 35040 non-null  int64  
 12  year                                  35040 non-null  int64  
 13  time                                  35040 non-null  int64  
dtypes: float64(6), int64(8)
memory usage: 3.7 MB

 

Missing values : None 
 Duplicates : Empty DataFrame
Columns: [Usage_kWh, Lagging_Current_Reactive.Power_kVarh, Leading_Current_Reactive_Power_kVarh, CO2(tCO2), Lagging_Current_Power_Factor, Leading_Current_Power_Factor, NSM, WeekStatus, Day_of_week, Load_Type, day, month, year, time]
Index: [] 
 Outliers :           Usage_kWh  Lagging_Current_Reactive.Power_kVarh  \
count  35040.000000                          35040.000000   
mean      27.386892                             13.035384   
std       33.444380                             16.306000   
min        0.000000                              0.000000   
25%        3.200000                              2.300000   
50%        4.570000                              5.000000   
75%       51.237500                             22.640000   
max      157.180000                             96.910000   

       Leading_Current_Reactive_Power_kVarh     CO2(tCO2)  \
count                          35040.000000  35040.000000   
mean                               3.870949      0.011524   
std                                7.424463      0.016151   
min                                0.000000      0.000000   
25%                                0.000000      0.000000   
50%                                0.000000      0.000000   
75%                                2.090000      0.020000   
max                               27.760000      0.070000   

       Lagging_Current_Power_Factor  Leading_Current_Power_Factor  \
count                  35040.000000                  35040.000000   
mean                      80.578056                     84.367870   
std                       18.921322                     30.456535   
min                        0.000000                      0.000000   
25%                       63.320000                     99.700000   
50%                       87.960000                    100.000000   
75%                       99.022500                    100.000000   
max                      100.000000                    100.000000   

                NSM    WeekStatus   Day_of_week     Load_Type           day  \
count  35040.000000  35040.000000  35040.000000  35040.000000  35040.000000   
mean   42750.000000      0.284932      2.991781      0.691781     15.720548   
std    24940.534317      0.451388      2.003433      0.792658      8.796373   
min        0.000000      0.000000      0.000000      0.000000      1.000000   
25%    21375.000000      0.000000      1.000000      0.000000      8.000000   
50%    42750.000000      0.000000      3.000000      0.000000     16.000000   
75%    64125.000000      1.000000      5.000000      1.000000     23.000000   
max    85500.000000      1.000000      6.000000      2.000000     31.000000   

              month     year          time  
count  35040.000000  35040.0  35040.000000  
mean       6.526027   2018.0    712.500000  
std        3.447901      0.0    415.675572  
min        1.000000   2018.0      0.000000  
25%        4.000000   2018.0    356.250000  
50%        7.000000   2018.0    712.500000  
75%       10.000000   2018.0   1068.750000  
max       12.000000   2018.0   1425.000000   
 Everything looks fine!!
The boxplots show some points that could be suspected outliers, 
 but they are numerous and densely packed, so they merely sit away from the mean 
 and are hard to call outliers.

[Correlation analysis and visualization]

 

 

year is constant (2018 throughout), so it shows no correlation with anything and looks safe to drop

[Correlation significance testing]

Usage_kWh vs Load_Type : corr=0.6964 / p_value=0.0000
The null hypothesis is rejected: there is a statistically significant correlation between the two variables.
Lagging_Current_Reactive.Power_kVarh vs Load_Type : corr=0.3602 / p_value=0.0000
The null hypothesis is rejected: there is a statistically significant correlation between the two variables.
Leading_Current_Reactive_Power_kVarh vs Load_Type : corr=0.0835 / p_value=0.0000
The null hypothesis is rejected: there is a statistically significant correlation between the two variables.
CO2(tCO2) vs Load_Type : corr=0.6656 / p_value=0.0000
The null hypothesis is rejected: there is a statistically significant correlation between the two variables.
Lagging_Current_Power_Factor vs Load_Type : corr=0.4548 / p_value=0.0000
The null hypothesis is rejected: there is a statistically significant correlation between the two variables.
Leading_Current_Power_Factor vs Load_Type : corr=-0.0280 / p_value=0.0000
The null hypothesis is rejected: there is a statistically significant correlation between the two variables.
NSM vs Load_Type : corr=0.5255 / p_value=0.0000
The null hypothesis is rejected: there is a statistically significant correlation between the two variables.
WeekStatus vs Load_Type : corr=-0.2106 / p_value=0.0000
The null hypothesis is rejected: there is a statistically significant correlation between the two variables.
Day_of_week vs Load_Type : corr=-0.1778 / p_value=0.0000
The null hypothesis is rejected: there is a statistically significant correlation between the two variables.
day vs Load_Type : corr=0.0149 / p_value=0.0054
The null hypothesis is rejected: there is a statistically significant correlation between the two variables.
month vs Load_Type : corr=-0.0137 / p_value=0.0104
The null hypothesis is rejected: there is a statistically significant correlation between the two variables.
time vs Load_Type : corr=0.5255 / p_value=0.0000
The null hypothesis is rejected: there is a statistically significant correlation between the two variables.

Checking each independent variable's p-value with the Spearman test, 
every independent variable has p-value < 0.05 against the target, so all of them look meaningful.

[Standardization + normalization]

[ StandardScaler ]
--------------------------< RandomForestClassifier >---------------------
--------------------------< ExtraTreesClassifier >---------------------
--------------------------< GradientBoostingClassifier >---------------------
--------------------------< HistGradientBoostingClassifier >---------------------
--------------------------< XGBClassifier >---------------------
--------------------------< MLPClassifier >---------------------
Epoch 1/100
657/657 [==============================] - 3s 3ms/step - loss: 0.4841 - accuracy: 0.7754 - val_loss: 0.3974 - val_accuracy: 0.8122
Epoch 2/100
657/657 [==============================] - 2s 3ms/step - loss: 0.3592 - accuracy: 0.8372 - val_loss: 0.3263 - val_accuracy: 0.8557
Epoch 3/100
657/657 [==============================] - 2s 3ms/step - loss: 0.3005 - accuracy: 0.8633 - val_loss: 0.2811 - val_accuracy: 0.8760
Epoch 4/100
657/657 [==============================] - 2s 3ms/step - loss: 0.2646 - accuracy: 0.8819 - val_loss: 0.2754 - val_accuracy: 0.8723
Epoch 5/100
657/657 [==============================] - 2s 4ms/step - loss: 0.2396 - accuracy: 0.8964 - val_loss: 0.2424 - val_accuracy: 0.8945
Epoch 6/100
657/657 [==============================] - 2s 3ms/step - loss: 0.2190 - accuracy: 0.9084 - val_loss: 0.2232 - val_accuracy: 0.9004
Epoch 7/100
657/657 [==============================] - 3s 4ms/step - loss: 0.2044 - accuracy: 0.9152 - val_loss: 0.2123 - val_accuracy: 0.9081
Epoch 8/100
657/657 [==============================] - 2s 4ms/step - loss: 0.1948 - accuracy: 0.9187 - val_loss: 0.1941 - val_accuracy: 0.9208
Epoch 9/100
657/657 [==============================] - 3s 4ms/step - loss: 0.1835 - accuracy: 0.9250 - val_loss: 0.1789 - val_accuracy: 0.9298
Epoch 10/100
657/657 [==============================] - 2s 3ms/step - loss: 0.1731 - accuracy: 0.9287 - val_loss: 0.1792 - val_accuracy: 0.9247
Epoch 11/100
657/657 [==============================] - 2s 3ms/step - loss: 0.1683 - accuracy: 0.9320 - val_loss: 0.1749 - val_accuracy: 0.9248
Epoch 12/100
657/657 [==============================] - 2s 3ms/step - loss: 0.1594 - accuracy: 0.9364 - val_loss: 0.1739 - val_accuracy: 0.9269
Epoch 13/100
657/657 [==============================] - 1s 2ms/step - loss: 0.1541 - accuracy: 0.9392 - val_loss: 0.1639 - val_accuracy: 0.9334
Epoch 14/100
657/657 [==============================] - 1s 2ms/step - loss: 0.1481 - accuracy: 0.9404 - val_loss: 0.1499 - val_accuracy: 0.9408
Epoch 15/100
657/657 [==============================] - 1s 2ms/step - loss: 0.1427 - accuracy: 0.9436 - val_loss: 0.1545 - val_accuracy: 0.9369
Epoch 16/100
657/657 [==============================] - 1s 2ms/step - loss: 0.1397 - accuracy: 0.9443 - val_loss: 0.1671 - val_accuracy: 0.9274

 

[ MinMaxScaler ]
--------------------------< RandomForestClassifier >---------------------
--------------------------< ExtraTreesClassifier >---------------------
--------------------------< GradientBoostingClassifier >---------------------
--------------------------< HistGradientBoostingClassifier >---------------------
--------------------------< XGBClassifier >---------------------
--------------------------< MLPClassifier >---------------------
Epoch 1/100
657/657 [==============================] - 1s 863us/step - loss: 0.5828 - accuracy: 0.7333 - val_loss: 0.4731 - val_accuracy: 0.7928
Epoch 2/100
657/657 [==============================] - 1s 796us/step - loss: 0.4549 - accuracy: 0.7887 - val_loss: 0.4384 - val_accuracy: 0.8034
Epoch 3/100
657/657 [==============================] - 1s 793us/step - loss: 0.4217 - accuracy: 0.8027 - val_loss: 0.4000 - val_accuracy: 0.8212
Epoch 4/100
657/657 [==============================] - 0s 724us/step - loss: 0.3906 - accuracy: 0.8232 - val_loss: 0.4063 - val_accuracy: 0.8225
Epoch 5/100
657/657 [==============================] - 1s 783us/step - loss: 0.3612 - accuracy: 0.8397 - val_loss: 0.3526 - val_accuracy: 0.8398
Epoch 6/100
657/657 [==============================] - 1s 786us/step - loss: 0.3343 - accuracy: 0.8473 - val_loss: 0.3271 - val_accuracy: 0.8527
Epoch 7/100
657/657 [==============================] - 1s 789us/step - loss: 0.3153 - accuracy: 0.8543 - val_loss: 0.3262 - val_accuracy: 0.8522
Epoch 8/100
657/657 [==============================] - 1s 766us/step - loss: 0.2991 - accuracy: 0.8625 - val_loss: 0.2911 - val_accuracy: 0.8663
Epoch 9/100
657/657 [==============================] - 1s 800us/step - loss: 0.2850 - accuracy: 0.8708 - val_loss: 0.2757 - val_accuracy: 0.8783
Epoch 10/100
657/657 [==============================] - 1s 782us/step - loss: 0.2717 - accuracy: 0.8760 - val_loss: 0.2729 - val_accuracy: 0.8760
Epoch 11/100
657/657 [==============================] - 1s 793us/step - loss: 0.2630 - accuracy: 0.8833 - val_loss: 0.2626 - val_accuracy: 0.8838
Epoch 12/100
657/657 [==============================] - 1s 788us/step - loss: 0.2530 - accuracy: 0.8872 - val_loss: 0.2480 - val_accuracy: 0.8843
Epoch 13/100
657/657 [==============================] - 1s 803us/step - loss: 0.2466 - accuracy: 0.8897 - val_loss: 0.2472 - val_accuracy: 0.8920
Epoch 14/100
657/657 [==============================] - 1s 770us/step - loss: 0.2397 - accuracy: 0.8967 - val_loss: 0.2419 - val_accuracy: 0.8931
Epoch 15/100
657/657 [==============================] - 1s 779us/step - loss: 0.2328 - accuracy: 0.9000 - val_loss: 0.2354 - val_accuracy: 0.8968
Epoch 16/100
657/657 [==============================] - 1s 768us/step - loss: 0.2303 - accuracy: 0.8991 - val_loss: 0.2483 - val_accuracy: 0.8807
Epoch 17/100
657/657 [==============================] - 1s 783us/step - loss: 0.2241 - accuracy: 0.9039 - val_loss: 0.2347 - val_accuracy: 0.8965
Epoch 18/100
657/657 [==============================] - 1s 798us/step - loss: 0.2197 - accuracy: 0.9072 - val_loss: 0.2195 - val_accuracy: 0.9055
Epoch 19/100
657/657 [==============================] - 1s 759us/step - loss: 0.2178 - accuracy: 0.9065 - val_loss: 0.2483 - val_accuracy: 0.8864
Epoch 20/100
657/657 [==============================] - 1s 793us/step - loss: 0.2108 - accuracy: 0.9105 - val_loss: 0.2141 - val_accuracy: 0.9110
Epoch 21/100
657/657 [==============================] - 1s 810us/step - loss: 0.2093 - accuracy: 0.9126 - val_loss: 0.2051 - val_accuracy: 0.9142
Epoch 22/100
657/657 [==============================] - 0s 755us/step - loss: 0.2045 - accuracy: 0.9167 - val_loss: 0.2224 - val_accuracy: 0.9055
Epoch 23/100
657/657 [==============================] - 0s 743us/step - loss: 0.2043 - accuracy: 0.9150 - val_loss: 0.2198 - val_accuracy: 0.9004

 

 

[Result DataFrames]

DataFrame 1:

                            Model  Train Acc   Val Acc  Test Acc  (Train-Val)  \
0          RandomForestClassifier  0.985683  0.975742  0.977312  0.009941   
1            ExtraTreesClassifier  0.907344  0.894264  0.887985  0.013080   
2      GradientBoostingClassifier  1.000000  0.996005  0.996005  0.003995   
3  HistGradientBoostingClassifier  0.999810  0.997860  0.998858  0.001950   
4                   XGBClassifier  1.000000  0.998002  0.998288  0.001998   
5                   MLPClassifier  0.972888  0.961045  0.957049  0.011844   
6                             DNN       NaN       NaN  0.933790       NaN   

  Precision    Recall  f1-score  \
0  0.977312  0.977312  0.977312   
1  0.887985  0.887985  0.887985   
2  0.996005  0.996005  0.996005   
3  0.998858  0.998858  0.998858   
4  0.998288  0.998288  0.998288   
5  0.957049  0.957049  0.957049   
6  0.933790  0.933790  0.933790   

                                    Confusion Matrix  
0    [[3572, 32, 7], [58, 1865, 22], [18, 22, 1412]]  
1  [[3532, 76, 3], [133, 1514, 298], [48, 227, 11...  
2         [[3594, 8, 9], [4, 1940, 1], [6, 0, 1446]]  
3         [[3605, 1, 5], [0, 1945, 0], [2, 0, 1450]]  
4         [[3602, 2, 7], [1, 1944, 0], [2, 0, 1450]]  
5   [[3537, 62, 12], [69, 1811, 65], [30, 63, 1359]]  
6  [[3476, 120, 15], [112, 1749, 84], [51, 82, 13...  


DataFrame 2:

                            Model  Train Acc   Val Acc  Test Acc  (Train-Val)  \
0          RandomForestClassifier  0.985683  0.975885  0.977454  0.009798   
1            ExtraTreesClassifier  0.907344  0.894264  0.887985  0.013080   
2      GradientBoostingClassifier  1.000000  0.996147  0.995719  0.003853   
3  HistGradientBoostingClassifier  0.999715  0.998716  0.998573  0.000999   
4                   XGBClassifier  1.000000  0.998002  0.998288  0.001998   
5                   MLPClassifier  0.958524  0.954481  0.947631  0.004043   
6                             DNN       NaN       NaN  0.911387       NaN   

  Precision    Recall  f1-score  \
0  0.977454  0.977454  0.977454   
1  0.887985  0.887985  0.887985   
2  0.995719  0.995719  0.995719   
3  0.998573  0.998573  0.998573   
4  0.998288  0.998288  0.998288   
5  0.947631  0.947631  0.947631   
6  0.911387  0.911387  0.911387   

                                    Confusion Matrix  
0    [[3571, 33, 7], [58, 1867, 20], [18, 22, 1412]]  
1  [[3532, 76, 3], [133, 1514, 298], [48, 227, 11...  
2        [[3592, 11, 8], [3, 1941, 1], [7, 0, 1445]]  
3         [[3603, 2, 6], [0, 1945, 0], [2, 0, 1450]]  
4         [[3602, 2, 7], [1, 1944, 0], [2, 0, 1450]]  
5   [[3505, 81, 25], [88, 1759, 98], [29, 46, 1377]]  
6  [[3443, 140, 28], [143, 1648, 154], [35, 121, ...  


Comparing the results: with StandardScaler, HistGradientBoostingClassifier looks best, with the highest accuracy and good generalization; with MinMaxScaler, HistGradientBoostingClassifier again looks best.
