-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexecution_functions.py
727 lines (606 loc) · 29 KB
/
execution_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
import time
import pandas as pd
import numpy as np
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import (
roc_auc_score,
average_precision_score,
make_scorer,
mean_absolute_error
)
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from utils.utils_preprocessing import encode_categorical_features, CustomLabelEncoder, ImputeMissing
from params_config import (
fast_cb_bin_params,
fast_cb_regression_params,
fast_cb_multi_params,
rf_params
)
# Import preprocessing utilities
from utils.utils_preprocessing import (
encode_categorical_features, kmeans_discretize, ImputeMissing, target_encode_variables
)
from utils.utils_methods import get_predictions
# Import feature selection methods (same imports as original)
from feature_selection_methods.filter.bivariate.chi_squared import ChiSquaredSelector
from feature_selection_methods.filter.bivariate.correlation import CorrelationSelector
from feature_selection_methods.filter.bivariate.information_value import WOEInformationValueSelector
from feature_selection_methods.filter.bivariate.norm_mutual_info import NormalizedMutualInfoSelector
from feature_selection_methods.filter.multivariate.mrmr import MRMRSelector
from feature_selection_methods.filter.multivariate.relief_algorithms import ReliefSelector
from feature_selection_methods.filter.multivariate.fcbf import FCBFSelector
from feature_selection_methods.wrapper.backward_elimination.sequential_backward_selection import SeqBackSelectorCV
from feature_selection_methods.wrapper.forward_selection.sequential_forward_selection import SeqForwardSelectorCV
from feature_selection_methods.wrapper.bidirectional.sequential_forward_floating_selection import SeqForwardFloatingSelectorCV
from feature_selection_methods.wrapper.bidirectional.sequential_backward_floating_selection import SeqBackFloatingSelectorCV
from feature_selection_methods.hybrid.embedded_wrapper.recursive_feature_elimination import HybridRFE
from feature_selection_methods.hybrid.filter_wrapper.fcbf_sfs import HybridFcbfSfs
from feature_selection_methods.hybrid.filter_wrapper.nmi_sfs import HybridNmiSfs
from feature_selection_methods.hybrid.advanced_wrapper.shap_sfs import HybridShapSfs
from feature_selection_methods.embedded.importance.permutation_feature_importance import PermutationImportanceSelector
from feature_selection_methods.embedded.importance.rf_feature_importances import RFFearureImportanceSelector
from feature_selection_methods.embedded.importance.cb_feature_importances import CatBoostFeatureImportanceSelector
from feature_selection_methods.advanced.shap import ShapFeatureImportanceSelector
from feature_selection_methods.advanced.boruta import CatBoostBoruta
def evaluate_synthetic_dataset(X_selected, ground_truth_features, interactive_features, informative_features, feature_importances, execution_time, X_original):
    """
    Evaluate feature selection on synthetic datasets using weighted accuracy.

    The weighted accuracy combines two parts, each weighted by the share of
    all features it covers:
      * ground-truth part: sum of the normalized importances of the ground
        truth features that were selected;
      * non-ground-truth part: fraction of non-ground-truth features that
        were correctly left out.

    Parameters:
    -----------
    X_selected : pd.DataFrame
        DataFrame containing only the selected features
    ground_truth_features : list
        List of feature names that are truly relevant
    interactive_features : list
        List of interactive features
    informative_features : list
        List of informative features
    feature_importances : dict
        Dictionary mapping ground truth features to their importance.
        Values are normalized by their sum, so any positive scale works.
        Every selected ground truth feature must have an entry (KeyError
        otherwise).
    execution_time : float
        Time taken to run the feature selection
    X_original : pd.DataFrame
        Original DataFrame with all features

    Returns:
    --------
    dict
        Dictionary with the keys 'weighted_accuracy', 'ground_truth_accuracy',
        'non_ground_truth_accuracy', 'interactive_selection_rate' (the string
        'N/A' when there are no interactive features),
        'informative_selection_rate' (0 when there are no informative
        features), 'execution_time' and 'n_selected_features'.
    """
    selected_features = set(X_selected.columns)
    ground_truth = set(ground_truth_features)
    all_features = set(X_original.columns)

    # Split evaluation into ground truth and non-ground truth parts
    n_total_features = len(all_features)
    n_ground_truth = len(ground_truth)
    n_non_ground_truth = n_total_features - n_ground_truth

    # Weights of both parts; an empty original dataset contributes nothing
    # instead of raising ZeroDivisionError.
    if n_total_features > 0:
        ground_truth_weight = n_ground_truth / n_total_features
        non_ground_truth_weight = n_non_ground_truth / n_total_features
    else:
        ground_truth_weight = 0
        non_ground_truth_weight = 0

    # True positives, weighted by normalized importance. If the importances
    # sum to zero there is nothing to normalize against, so the score is 0.
    total_importance = sum(feature_importances.values())
    if total_importance:
        ground_truth_score = sum(
            feature_importances[feature] / total_importance
            for feature in ground_truth
            if feature in selected_features
        )
    else:
        ground_truth_score = 0

    # True negatives: non-ground-truth features that were correctly not selected
    non_ground_truth_features = all_features - ground_truth
    true_negatives = len(non_ground_truth_features - selected_features)
    if n_non_ground_truth > 0:
        non_ground_truth_accuracy = true_negatives / n_non_ground_truth
    else:
        non_ground_truth_accuracy = 0

    # Combine both parts with their respective weights
    weighted_accuracy = (
        (ground_truth_weight * ground_truth_score) +
        (non_ground_truth_weight * non_ground_truth_accuracy)
    )

    # Share of interactive features that were selected
    n_interactive_total = len(interactive_features)
    if n_interactive_total > 0:
        selected_interactive = selected_features.intersection(interactive_features)
        interactive_selection_rate = len(selected_interactive) / n_interactive_total
    else:
        interactive_selection_rate = 'N/A'

    # Share of informative features that were selected. The list is used as
    # given: features that are also interactive are NOT filtered out here.
    n_informative_total = len(informative_features)
    selected_informative = selected_features.intersection(informative_features)
    informative_selection_rate = (
        len(selected_informative) / n_informative_total if n_informative_total > 0 else 0
    )

    return {
        'weighted_accuracy': weighted_accuracy,
        'ground_truth_accuracy': ground_truth_score,
        'non_ground_truth_accuracy': non_ground_truth_accuracy,
        'interactive_selection_rate': interactive_selection_rate,
        'informative_selection_rate': informative_selection_rate,
        'execution_time': execution_time,
        'n_selected_features': len(selected_features)
    }
def get_model_and_scorer(problem_type, balanced_status, categorical_features, n_samples):
    """
    Create a configured CatBoost model and scoring metric for the problem type.

    Args:
        problem_type (str): 'regression', 'binary_classification', or
            'multiclass_classification'. Any value other than the first two
            is handled by the multiclass branch.
        balanced_status (str): 'Imbalanced' selects average precision for
            classification; any other value selects ROC-AUC. Ignored for
            regression.
        categorical_features (list): Categorical feature column names, passed
            to CatBoost as `cat_features`.
        n_samples (int): Number of samples in the dataset, used to set
            `min_data_in_leaf`.

    Returns:
        tuple: Contains:
            - model: CatBoost model built from the task's `fast_cb_*` params
              plus `min_data_in_leaf` and `cat_features`
            - params: the task's `fast_cb_*` parameter dict (without the two
              extra base parameters)
            - scorer: sklearn scorer object (negated MAE for regression)
            - cv: 3-fold KFold (regression) or StratifiedKFold (classification),
              shuffled with random_state=42
            - str: Direction of optimization ('minimize' or 'maximize')
    """
    # min_data_in_leaf is 1% of 2/3 of the samples (the training share),
    # clamped to at least 1 so small datasets do not round down to 0 -- the
    # same clamp initialize_selector applies to the random forest.
    base_params = {
        'min_data_in_leaf': max(1, round(n_samples * 2/3 * 0.01)),
        'cat_features': categorical_features
    }

    # Handle regression case
    if problem_type == 'regression':
        model = CatBoostRegressor(**fast_cb_regression_params, **base_params)
        scorer = make_scorer(mean_absolute_error, greater_is_better=False)
        cv = KFold(n_splits=3, shuffle=True, random_state=42)
        params = fast_cb_regression_params
        return model, params, scorer, cv, 'minimize'

    # For classification problems, use stratified cross-validation
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    # NOTE(review): `needs_proba` is reported as deprecated in newer
    # scikit-learn releases in favour of `response_method` -- confirm against
    # the pinned scikit-learn version before upgrading.
    if problem_type == 'binary_classification':
        model = CatBoostClassifier(**fast_cb_bin_params, **base_params)
        # Use average precision for imbalanced data, ROC-AUC for balanced
        if balanced_status == 'Imbalanced':
            scorer = make_scorer(average_precision_score, needs_proba=True)
        else:
            scorer = make_scorer(roc_auc_score, needs_proba=True)
        params = fast_cb_bin_params
    else:
        # Multiclass classification
        model = CatBoostClassifier(**fast_cb_multi_params, **base_params)
        # Use average precision for imbalanced data, ROC-AUC (one-vs-rest) for balanced
        if balanced_status == 'Imbalanced':
            scorer = make_scorer(average_precision_score, needs_proba=True)
        else:
            scorer = make_scorer(roc_auc_score, multi_class='ovr', needs_proba=True)
        params = fast_cb_multi_params

    return model, params, scorer, cv, 'maximize'
def _baseline_score(problem_type, y_train, y_test):
    """Return the fallback score used when a model cannot be evaluated.

    Regression: MAE of predicting the training mean on the test targets.
    Classification: 0.
    """
    if problem_type != 'regression':
        return 0
    return np.mean(np.abs(y_test - y_train.mean()))


def evaluate_real_dataset(X_train_selected, X_test, y_train, y_test, problem_type, execution_time, categorical_features, balanced_status):
    """
    Score a feature subset on a real-world dataset with CatBoost and Random Forest.

    Both models are fitted on the training data and scored on the test set
    with the metric chosen by get_model_and_scorer. When no features were
    selected, or a model raises during fitting/scoring, that model receives
    the baseline score (train-mean MAE for regression, 0 for classification).

    Parameters:
    -----------
    X_train_selected : pd.DataFrame
        Training data with selected features
    X_test : pd.DataFrame
        Test data, expected to be already restricted to the selected features
    y_train : array-like
        Target variable for training (must support .mean())
    y_test : array-like
        Target variable for test set
    problem_type : str
        Type of problem ('regression', 'binary_classification', 'multiclass_classification')
    execution_time : float
        Time taken by feature selection; passed through to the result
    categorical_features : list
        List of categorical feature names
    balanced_status : str
        Whether the dataset is 'Balanced' or 'Imbalanced'

    Returns:
    --------
    dict
        'models_performance' (dict with 'catboost' and 'random_forest' scores),
        'execution_time' and 'n_selected_features'
    """
    n_features = len(X_train_selected.columns)

    # Nothing selected: both models get the baseline score
    if X_train_selected.empty or n_features == 0:
        fallback = _baseline_score(problem_type, y_train, y_test)
        return {
            'models_performance': {'catboost': fallback, 'random_forest': fallback},
            'execution_time': execution_time,
            'n_selected_features': 0
        }

    # Keep only the categorical features that survived selection
    selected_columns = X_train_selected.columns
    cat_in_selection = [name for name in categorical_features if name in selected_columns]

    # Only the CatBoost parameters and the scorer are needed here
    _, params, scorer, _, _ = get_model_and_scorer(
        problem_type,
        balanced_status,
        cat_in_selection,
        X_train_selected.shape[0]
    )

    # Preprocessors used only for the Random Forest inputs
    imputer = ImputeMissing()
    label_encoder = CustomLabelEncoder()

    if problem_type == 'regression':
        catboost_cls, forest_cls = CatBoostRegressor, RandomForestRegressor
    else:
        catboost_cls, forest_cls = CatBoostClassifier, RandomForestClassifier
    models = {
        'catboost': catboost_cls(cat_features=cat_in_selection, **params),
        'random_forest': forest_cls(**rf_params)
    }

    model_scores = dict.fromkeys(('catboost', 'random_forest'))
    for model_name, model in models.items():
        try:
            if model_name == 'random_forest':
                # Random Forest gets imputed + label-encoded inputs
                train_imputed = imputer.fit_transform(X_train_selected)
                train_matrix = label_encoder.fit_transform(train_imputed)
                test_imputed = imputer.transform(X_test)
                test_matrix = label_encoder.transform(test_imputed)
            else:
                # CatBoost receives the data as-is (missing values included)
                train_matrix, test_matrix = X_train_selected, X_test

            model.fit(train_matrix, y_train)
            y_pred = get_predictions(model, test_matrix, problem_type)

            # Average precision on multiclass targets is scored against a
            # one-hot (binarized) version of the test labels
            binarize_target = (
                problem_type == 'multiclass_classification'
                and balanced_status == 'Imbalanced'
                and 'average_precision_score' in str(scorer._score_func)
            )
            if binarize_target:
                y_true = label_binarize(y_test, classes=np.unique(y_train))
            else:
                y_true = y_test
            model_scores[model_name] = scorer._score_func(y_true, y_pred, **scorer._kwargs)
        except Exception as e:
            # Best effort: report the failure and fall back to the baseline
            print(f"Warning: Error evaluating {model_name}: {str(e)}")
            model_scores[model_name] = _baseline_score(problem_type, y_train, y_test)

    return {
        'models_performance': model_scores,
        'execution_time': execution_time,
        'n_selected_features': n_features
    }
def initialize_and_run_feature_selection(method_name, X, y, task_type, categorical_features, is_balanced):
    """
    Run one feature selection method end to end and time it.

    Steps: method-specific preprocessing (preprocess_methods), selector
    construction (initialize_selector), fit/transform, and mapping of the
    selected column names back onto the untouched input frame. The timer
    starts after preprocessing, so preprocessing time is not included.

    Args:
        method_name (str): Name of the feature selection method to use
        X (pd.DataFrame): Input features to perform selection on
        y (array-like): Target variable
        task_type (str): 'regression', 'binary_classification' or
            'multiclass_classification'
        categorical_features (list): List of categorical feature names.
            Not used by this function: the categorical columns are re-derived
            from the dtypes of the preprocessed data.
        is_balanced (str): Dataset balance status ('Balanced' or 'Imbalanced')

    Returns:
        tuple: Contains:
            - pd.DataFrame or None: Columns of X that were selected, or None
              when the method is unknown or the selector raised
            - float: Execution time in seconds (0 if failed)
    """
    X_ready, y_ready = preprocess_methods(X, y, task_type, method_name)

    # Timing covers selector construction and fitting only
    started_at = time.time()

    # Categorical columns as they look after preprocessing
    cat_columns = X_ready.select_dtypes(include=['object', 'category']).columns.tolist()

    selector = initialize_selector(
        method_name, task_type, X_ready.shape[0], cat_columns, is_balanced
    )
    if selector is None:
        # Unknown method name
        return None, 0

    try:
        reduced = selector.fit_transform(X_ready, y_ready)
        # Return the original (unprocessed) values of the chosen columns
        X_selected = X[reduced.columns]
        elapsed = time.time() - started_at
    except Exception as e:
        print(f"Error in {method_name}: {str(e)}")
        return None, 0

    return X_selected, elapsed
def initialize_selector(method_name, task_type, n_samples, categorical_features, balanced_status):
    """
    Build the feature selector registered under `method_name`.

    Acts as a factory: the model, scorer, CV splitter and metric direction
    come from get_model_and_scorer, and each method name maps to a selector
    class plus the keyword arguments it is constructed with.

    Args:
        method_name (str): Name of the feature selection method to initialize
        task_type (str): Type of ML task (regression/classification)
        n_samples (int): Number of samples in dataset
        categorical_features (list): List of categorical feature names
        balanced_status (str): Whether dataset is 'Balanced' or 'Imbalanced'

    Returns:
        object: Initialized feature selector instance, or None when
        `method_name` is not a registered method
    """
    model, params, scorer, cv, metric_direction = get_model_and_scorer(
        task_type,
        balanced_status,
        categorical_features,
        n_samples
    )

    # Random Forest backing the 'rf_fimportances' method; leaf size is 1% of
    # 2/3 of the samples, never below 1
    forest_cls = RandomForestRegressor if task_type == 'regression' else RandomForestClassifier
    rf_embedded = forest_cls(
        **rf_params,
        min_samples_leaf=max(1, round(n_samples*2/3*0.01))
    )

    # Keyword arguments shared by every model-driven, cross-validated search
    cv_search = {
        'model': model,
        'scorer': scorer,
        'metric_direction': metric_direction,
        'cv': cv,
        'categorical_features': categorical_features,
        'task': task_type,
    }

    # method name -> (selector class, constructor keyword arguments)
    registry = {
        # Filter
        'chi_squared': (ChiSquaredSelector, {'alpha': 0.05}),
        'information_value': (
            WOEInformationValueSelector,
            {'threshold_iv': 0.02, 'task': task_type},
        ),
        'correlation': (CorrelationSelector, {'threshold': 0.1, 'task': task_type}),
        'mutual_info': (NormalizedMutualInfoSelector, {'threshold': 0.01}),
        'mrmr': (MRMRSelector, {'threshold': 0.01}),
        'fcbf': (FCBFSelector, {'threshold': 0.01}),
        'relief': (
            ReliefSelector,
            {'k': 10, 'threshold': 0.1, 'sigma': 0.5, 'n_jobs': -1, 'task': task_type},
        ),
        # Wrapper
        'seq_backward': (
            SeqBackSelectorCV,
            {**cv_search, 'loss_threshold': 0.001, 'min_features': 1},
        ),
        'seq_forward': (SeqForwardSelectorCV, {**cv_search, 'min_improvement': 0.001}),
        'seq_forward_floating': (
            SeqForwardFloatingSelectorCV,
            {**cv_search, 'min_improvement': 0.001},
        ),
        'seq_backward_floating': (
            SeqBackFloatingSelectorCV,
            {**cv_search, 'loss_threshold': 0.001, 'min_features': 1},
        ),
        # Embedded
        'hybrid_rfe': (
            HybridRFE,
            {**cv_search, 'score_threshold': 0.015, 'min_features': 1},
        ),
        'rf_fimportances': (
            RFFearureImportanceSelector,
            {'rf_model': rf_embedded, 'threshold': 0.01},
        ),
        'cb_fimportances': (
            CatBoostFeatureImportanceSelector,
            {'model': model, 'threshold': 0.01},
        ),
        # Permutation importance is not given the categorical feature list
        'permutation_importance': (
            PermutationImportanceSelector,
            {
                'model': model,
                'scorer': scorer,
                'metric_direction': metric_direction,
                'n_repeats': 5,
                'cv': cv,
                'task': task_type,
                'threshold': 0.01,
            },
        ),
        # Advanced
        'shap': (
            ShapFeatureImportanceSelector,
            {
                'model': model,
                'cv': cv,
                'threshold': 0.01,
                'task': task_type,
                'operation_timeout': 300,
            },
        ),
        'boruta': (
            CatBoostBoruta,
            {
                'model_params': params,
                'n_iterations': 100,
                'alpha': 0.05,
                'random_state': 42,
                'categorical_features': categorical_features,
            },
        ),
        # Hybrid
        'hybrid_fcbf_sfs': (
            HybridFcbfSfs,
            {**cv_search, 'min_improvement': 0.001, 'fcbf_threshold': 0.001},
        ),
        'hybrid_nmi_sfs': (
            HybridNmiSfs,
            {**cv_search, 'min_improvement': 0.001, 'threshold': 0.001},
        ),
        'hybrid_shap_sfs': (
            HybridShapSfs,
            {
                **cv_search,
                'min_improvement': 0.001,
                'operation_timeout': 300,
                'threshold': 0.005,
            },
        ),
    }

    entry = registry.get(method_name)
    if entry is None:
        # Method not found
        return None
    selector_cls, selector_kwargs = entry
    return selector_cls(**selector_kwargs)
def specific_dataset_cleanings(X, dataset_name, y):
    """
    Apply dataset-specific cleaning operations based on the dataset name.

    The inputs are copied first, so the caller's objects are never modified.
    Datasets without a registered rule are returned as plain copies.

    Parameters:
    -----------
    X : pandas.DataFrame
        The input dataset to be cleaned
    dataset_name : str
        Name of the dataset to determine which specific cleaning operations to apply
    y : pandas.Series or numpy.ndarray
        Target values aligned with the rows of X

    Returns:
    --------
    tuple : (pandas.DataFrame, same type as y)
        The cleaned features and the cleaned target

    Notes:
    ------
    Current dataset-specific operations:
    - speeddating: Removes 'decision' and 'decision_o' columns (data leakage)
    - irish: Removes the 'Educational_level' column (data leakage)
    - weatherAUS: Removes samples where the target is a missing value
    """
    # Work on copies to avoid modifying the caller's data
    X_cleaned = X.copy()
    y_cleaned = y.copy()

    # Columns that leak the target, per dataset
    leakage_columns_by_dataset = {
        'speeddating': ['decision', 'decision_o'],
        'irish': ['Educational_level'],
    }

    if dataset_name in leakage_columns_by_dataset:
        # Only drop the columns that are actually present
        existing_leakage_columns = [
            col for col in leakage_columns_by_dataset[dataset_name]
            if col in X_cleaned.columns
        ]
        if existing_leakage_columns:
            X_cleaned = X_cleaned.drop(columns=existing_leakage_columns)
            print(f"Removed data leakage columns from {dataset_name}: {existing_leakage_columns}")
    elif dataset_name == 'weatherAUS':
        # Boolean mask of rows whose target is present; pd.isna accepts both
        # Series and ndarray targets
        valid_indices = ~pd.isna(y_cleaned)
        # Filter both X and y to keep only samples where target is not NaN
        X_cleaned = X_cleaned[valid_indices]
        y_cleaned = y_cleaned[valid_indices]
        n_removed = int((~valid_indices).sum())
        print(f"Removed {n_removed} samples with NaN targets from {dataset_name}")
        print(f"Remaining samples: {len(y_cleaned)}")

    return X_cleaned, y_cleaned
def _target_as_series(y):
    """Return y as a Series: first column of a DataFrame, else pd.Series(y)."""
    if isinstance(y, pd.DataFrame):
        return y.iloc[:, 0]
    return pd.Series(y)


def _impute_if_missing(X):
    """Run ImputeMissing on X only when it contains at least one missing value."""
    if X.isna().any().any():
        return ImputeMissing().fit_transform(X)
    return X


def preprocess_methods(X, y, task_type, method_name, num_bins=5):
    """
    Preprocess data specifically for different feature selection methods.

    Both inputs are copied before any transformation. Methods without a
    dedicated branch (e.g. the wrapper methods) receive the plain copies.

    Per-method steps:
    - 'chi_squared': target to Series, impute, kmeans_discretize X
      (and y for regression)
    - 'correlation': impute, then target_encode_variables(X, y)
    - 'information_value': impute, kmeans_discretize X
    - 'mutual_info', 'fcbf', 'mrmr': target to Series, impute,
      encode_categorical_features, kmeans_discretize X (and y for regression)
    - 'relief': impute, encode_categorical_features, target to numpy array
    - 'rf_fimportances': impute, encode_categorical_features, target to Series

    Parameters:
    -----------
    X : pd.DataFrame
        Input features
    y : pd.Series or pd.DataFrame
        Target variable; for a DataFrame only the first column is used by
        the branches listed above
    task_type : str
        Type of task ('regression', 'binary_classification', 'multiclass_classification')
    method_name : str
        Name of the feature selection method the data is prepared for
    num_bins : int, default=5
        Number of bins passed to kmeans_discretize

    Returns:
    --------
    tuple : (pd.DataFrame, pd.Series or np.ndarray)
        Preprocessed features and target (the target is a numpy array for 'relief')
    """
    X_transformed = X.copy()
    y_transformed = y.copy()

    # Methods sharing the encode + discretize pipeline
    same_process_list = ['mutual_info', 'fcbf', 'mrmr']

    if method_name == 'chi_squared':
        y_transformed = _target_as_series(y_transformed)
        X_transformed = _impute_if_missing(X_transformed)
        # Discretize continuous features
        X_transformed = kmeans_discretize(X_transformed, num_bins)
        if task_type == 'regression':
            y_transformed = kmeans_discretize(y_transformed, num_bins)

    elif method_name == 'correlation':
        # Only a DataFrame target is reduced here; other types pass through as-is
        if isinstance(y_transformed, pd.DataFrame):
            y_transformed = y_transformed.iloc[:, 0]
        X_transformed = _impute_if_missing(X_transformed)
        # Get encoded DataFrame with categorical features transformed
        X_transformed = target_encode_variables(X_transformed, y_transformed)

    elif method_name == 'information_value':
        # Only a DataFrame target is reduced here; other types pass through as-is
        if isinstance(y_transformed, pd.DataFrame):
            y_transformed = y_transformed.iloc[:, 0]
        X_transformed = _impute_if_missing(X_transformed)
        # Discretize numerical variables
        X_transformed = kmeans_discretize(X_transformed, num_bins)

    elif method_name in same_process_list:
        y_transformed = _target_as_series(y_transformed)
        X_transformed = _impute_if_missing(X_transformed)
        # Encode categorical variables, then discretize numeric variables
        X_transformed = encode_categorical_features(X_transformed)
        X_transformed = kmeans_discretize(X_transformed, num_bins)
        if task_type == 'regression':
            y_transformed = kmeans_discretize(y_transformed, num_bins)

    elif method_name == 'relief':
        X_transformed = _impute_if_missing(X_transformed)
        X_transformed = encode_categorical_features(X_transformed)
        # Relief gets the target as a numpy array. An ndarray target keeps
        # the copy made above rather than aliasing the caller's array.
        if isinstance(y_transformed, pd.DataFrame):
            y_transformed = y_transformed.iloc[:, 0].values
        elif not isinstance(y_transformed, np.ndarray):
            y_transformed = pd.Series(y_transformed).values

    elif method_name == 'rf_fimportances':
        # Random forest does not accept missings and categorical features
        X_transformed = _impute_if_missing(X_transformed)
        X_transformed = encode_categorical_features(X_transformed)
        y_transformed = _target_as_series(y_transformed)

    return X_transformed, y_transformed