-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathml_cls_smote_04.py
107 lines (96 loc) · 4.75 KB
/
ml_cls_smote_04.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python
# Created by "Thieu" at 16:39, 04/04/2024 ----------%
# Email: [email protected] %
# Github: https://github.com/thieu1995 %
# --------------------------------------------------%
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from utils.data_util import divide_dataset_classification, scale_dataset_classification
from utils.result_util import save_classification_results
from config import Config, Const
from utils.feature_util import select_cls_features
## Load data: 12 monthly lag features (value-1 .. value-12) and a categorical
## "label" target. NOTE(review): the original also read a "month" column into
## y_raw but never used it — dropped here as dead data.
data = pd.read_csv("data/input_data/inflow_by_mean.csv")
X = data[[f"value-{lag}" for lag in range(1, 13)]].values
# Encode string class labels to integer codes (inverse_transform is used
# later when saving results).
lb_encoder = LabelEncoder()
y_clean = lb_encoder.fit_transform(data["label"].values)
## Divide dataset
x_train, x_test, y_train, y_test = divide_dataset_classification(X, y_clean, test_size=Config.TEST_SIZE)
## Scale dataset (standardization) and rebalance the training split
## (fix_imbalanced=True — presumably SMOTE, per the filename; confirm in scale_dataset_classification).
X_train_scaled, X_test_scaled, y_train, y_test, scaler_X, _ = scale_dataset_classification(x_train, x_test, y_train, y_test, scaler="std", fix_imbalanced=True)
## Select features with the multi-criteria (MC-FS) weighted score.
# 0.4 - 1, 3
key_features = "MC-FS"
SCORE_THRESHOLD = 0.4  # keep only features whose combined score exceeds this
selected_features_idx, selected_features_score = select_cls_features(X_train_scaled, y_train, mi_weight=0.33, anova_weight=0, dt_weight=0, rf_weight=0.33, svm_weight=0.33)
# Compute the kept-column index set once and apply it to both splits, so the
# threshold cannot silently diverge between train and test.
kept_idx = selected_features_idx[selected_features_score > SCORE_THRESHOLD]
X_train_scaled = X_train_scaled[:, kept_idx]
X_test_scaled = X_test_scaled[:, kept_idx]
print(selected_features_idx)
print(selected_features_score)
## Build models: (display name, estimator, hyper-parameter grid) for each
## candidate classifier, expanded into the dict layout the training loop expects.
_model_specs = [
    ("RF", RandomForestClassifier(random_state=Config.SEED), Config.MCFS_RF_GRID_CLS),
    ("SVM", SVC(probability=True, random_state=Config.SEED), Config.MCFS_SVM_GRID_CLS),
    ("LR", LogisticRegression(random_state=Config.SEED), Config.MCFS_LR_GRID_CLS),
    ("KNN", KNeighborsClassifier(), Config.MCFS_KNN_GRID_CLS),
    ("DT", DecisionTreeClassifier(random_state=Config.SEED), Config.MCFS_DT_GRID_CLS),
    ("AdaBoost", AdaBoostClassifier(random_state=Config.SEED), Config.MCFS_AdaBoost_GRID_CLS),
    ("MLP", MLPClassifier(random_state=Config.SEED), Config.MCFS_MLP_GRID_CLS),
]
list_models = [
    {"name": name, "model": estimator, "param_grid": grid}
    for name, estimator, grid in _model_specs
]
## Train each candidate model with an exhaustive grid search (macro-F1 as the
## selection metric), then persist per-model predictions and metrics.
# Fix: the original used `enumerate` but never consumed the index; the loop
# variable is also renamed to make clear it is a config dict, not an estimator.
for model_cfg in list_models:
    grid = GridSearchCV(model_cfg['model'], model_cfg['param_grid'], refit=True, verbose=0, n_jobs=8, scoring="f1_macro")
    grid.fit(X_train_scaled, y_train)
    # Metadata header for the results file: which feature set and which
    # hyper-parameters won the search.
    mm0 = {
        "features": key_features,
        "model": model_cfg['name'],
        "best_params": grid.best_params_,
        "best_estimator": grid.best_estimator_
    }
    # refit=True means `grid` delegates predict/predict_proba to the best estimator.
    y_train_pred = grid.predict(X_train_scaled)
    y_test_pred = grid.predict(X_test_scaled)
    # Store both the encoded (0..k-1) and original string labels so downstream
    # reports can show human-readable classes.
    results = {
        Const.Y_TRAIN_TRUE_SCALED: y_train,  # integer-encoded labels
        Const.Y_TRAIN_TRUE_UNSCALED: lb_encoder.inverse_transform(y_train),  # categorical string
        Const.Y_TRAIN_PRED_SCALED: y_train_pred,  # integer-encoded labels
        Const.Y_TRAIN_PRED_UNSCALED: lb_encoder.inverse_transform(y_train_pred),  # categorical string
        Const.Y_TEST_TRUE_SCALED: y_test,
        Const.Y_TEST_TRUE_UNSCALED: lb_encoder.inverse_transform(y_test),
        Const.Y_TEST_PRED_SCALED: y_test_pred,
        Const.Y_TEST_PRED_UNSCALED: lb_encoder.inverse_transform(y_test_pred),
        # Class probabilities (SVC was built with probability=True, so this
        # works for every estimator in the list); needed for AUC plots.
        Const.Y_TRAIN_PRED_PROB: grid.predict_proba(X_train_scaled),
        Const.Y_TEST_PRED_PROB: grid.predict_proba(X_test_scaled),
    }
    save_classification_results(results=results, validation=Config.VALIDATION_USED, metrics_head=mm0, metrics_file="metrics-results-cls",
                                test_filename=f"{model_cfg['name']}",
                                pathsave=f"{Config.DATA_RESULTS_CLS_SMOTE}/{key_features}",
                                name_labels=lb_encoder.classes_, name_model=model_cfg['name'], n_labels=len(lb_encoder.classes_),
                                loss_train=None, system=None, verbose=False, draw_auc=True)