1
1
import numpy as np
2
2
import pandas as pd
3
- import sklearn
4
- import sklearn .feature_selection
5
3
from PyImpetus import PPIMBC
6
4
from sklearn .svm import SVC
7
- import pandas as pd
5
+ from sklearn . feature_selection import f_classif
8
6
9
7
class feature_methods :
10
8
11
9
def __init__(self):
    """Create the feature-selection helper; no instance state is set here.

    Design note kept from the original author: the amount of features to
    select should default to 100% of the input features. When it is not
    100%, the amount is passed to the selection function, always as a
    percentage of the n input features, and is calculated dynamically.

    NOTE(review): this appears to describe intended behaviour rather than
    anything implemented in this method -- confirm where the percentage is
    converted before it reaches the selection methods.
    """
13
11
12
+
13
+
14
14
def getNfeaturesANOVAF(self, n, X_train, y_train):
    """
    Get the top n features based on the ANOVA F-value.

    The F-value of every column of ``X_train`` against ``y_train`` is
    computed with ``sklearn.feature_selection.f_classif``. Columns whose
    F-value is NaN are discarded, the remaining columns are ranked from
    the highest to the lowest F-value (ties keep their column order) and
    the first ``n`` of them are returned.

    Parameters
    ----------
    n : int or None
        Maximum number of features to return. ``None`` returns every
        ranked feature.
    X_train : pandas.DataFrame or numpy.ndarray
        Two-dimensional training data with one column per feature.
    y_train : array-like
        Target values, handed unchanged to ``f_classif``.

    Returns
    -------
    list
        Identifiers of the selected features, best first. If ``X_train``
        is a pandas DataFrame the column names are used, otherwise the
        column indices are used. The list is shorter than ``n`` when
        fewer columns have a non-NaN F-value.

    Raises
    ------
    ValueError
        If ``X_train`` is neither a DataFrame nor an ndarray, or if ``n``
        is negative.
    """
    # Check if input is a pandas DataFrame or numpy array
    if isinstance(X_train, pd.DataFrame):
        feature_names = X_train.columns  # Get column names
        X_train = X_train.values  # Convert to numpy array
    elif isinstance(X_train, np.ndarray):
        feature_names = np.arange(X_train.shape[1])  # Use indices as column names
    else:
        raise ValueError("X_train must be a pandas DataFrame or numpy array")

    # A negative n would silently drop features from the *end* of the
    # ranking through slice semantics, so reject it explicitly.
    if n is not None and n < 0:
        raise ValueError("n must be a non-negative number of features")

    # Nothing to rank; also keeps f_classif from rejecting an empty matrix.
    if X_train.shape[1] == 0:
        return []

    # f_classif scores every column of a 2-D matrix in one call, so no
    # per-column Python loop is needed.
    f_values = np.asarray(f_classif(X_train, y_train)[0], dtype=float)

    # Drop columns whose F-value is NaN
    valid = np.flatnonzero(~np.isnan(f_values))

    # A stable sort on the negated scores yields descending order while
    # keeping tied columns in their original order.
    ranked = valid[np.argsort(-f_values[valid], kind="stable")]

    # Return column names (or indices) of the top n features
    return [feature_names[i] for i in ranked[:n]]
69
67
70
68
69
+
71
70
def getNFeaturesMarkovBlanket (self , n , X_train , y_train ):
72
71
73
72
"""
0 commit comments