Skip to content

Commit 73240c9

Browse files
committed
Added NaN check to feature selection in the ANOVA method to prevent propagation of constant columns
1 parent 60d3b8e commit 73240c9

File tree

1 file changed

+23
-24
lines changed

1 file changed

+23
-24
lines changed

ml_grid/pipeline/data_feature_methods.py

+23-24
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
import numpy as np
22
import pandas as pd
3-
import sklearn
4-
import sklearn.feature_selection
53
from PyImpetus import PPIMBC
64
from sklearn.svm import SVC
7-
import pandas as pd
5+
from sklearn.feature_selection import f_classif
86

97
class feature_methods:
108

119
def __init__(self):
1210
"""set 100% for all, if not 100 then pass to function, always % of n input features. Calculate dynamically."""
1311

12+
13+
1414
def getNfeaturesANOVAF(self, n, X_train, y_train):
1515
"""
1616
Get the top n features based on the ANOVA F-value
@@ -37,37 +37,36 @@ def getNfeaturesANOVAF(self, n, X_train, y_train):
3737
are used, otherwise the column indices are used.
3838
"""
3939

40-
# check if input is a pandas DataFrame or numpy array
40+
# Check if input is a pandas DataFrame or numpy array
4141
if isinstance(X_train, pd.DataFrame):
42-
feature_names = X_train.columns # get column names
43-
X_train = X_train.values # convert to numpy array
42+
feature_names = X_train.columns # Get column names
43+
X_train = X_train.values # Convert to numpy array
4444
elif isinstance(X_train, np.ndarray):
45-
feature_names = np.arange(X_train.shape[1]) # use indices as column names
45+
feature_names = np.arange(X_train.shape[1]) # Use indices as column names
4646
else:
4747
raise ValueError("X_train must be a pandas DataFrame or numpy array")
4848

49-
# calculate F-value for each column in X_train
50-
# F-value is calculated by sklearn.feature_selection.f_classif
51-
# input is a 2D numpy array and target variable y_train
52-
# output is a 1D numpy array of F-values
49+
# Calculate F-value for each column in X_train
5350
res = []
5451
for i, col in enumerate(X_train.T):
55-
res.append(
56-
(
57-
feature_names[i], # add column name or index to tuple
58-
sklearn.feature_selection.f_classif(col.reshape(-1, 1), y_train)[0],
59-
)
60-
)
61-
62-
# sort the list based on F-value in descending order
63-
sortedList = sorted(res, key=lambda X: X[1], reverse=True)
64-
print(sortedList)
65-
# return column names of top n features
66-
nFeatures = sortedList[:n] # get top n features
67-
finalColNames = [elem[0] for elem in nFeatures] # get column names
52+
# Get the F-values from f_classif
53+
f_values = f_classif(col.reshape(-1, 1), y_train)[0]
54+
55+
# If the F-value is not NaN, add it to the results
56+
if not np.isnan(f_values[0]):
57+
res.append((feature_names[i], f_values[0]))
58+
59+
# Sort the list based on F-value in descending order
60+
sortedList = sorted(res, key=lambda x: x[1], reverse=True)
61+
62+
# Return column names of top n features
63+
nFeatures = sortedList[:n] # Get top n features
64+
finalColNames = [elem[0] for elem in nFeatures] # Get column names
65+
6866
return finalColNames
6967

7068

69+
7170
def getNFeaturesMarkovBlanket(self, n, X_train, y_train):
7271

7372
"""

0 commit comments

Comments
 (0)