- Added a section to display test metrics for model performance on the held-out test set.
- Implemented confusion matrix visualization to analyze prediction breakdown.
- Refactored sidebar settings to streamline metric selection and improve user experience.
- Updated cross-validation statistics to compare CV performance with test metrics.
- Enhanced DatasetEnsemble methods to handle empty data scenarios gracefully.
- Introduced debug scripts to assist in identifying feature mismatches and validating dataset preparation.
- Added comprehensive tests for DatasetEnsemble to ensure feature consistency and correct behavior across various scenarios.
72 lines
2.5 KiB
Python
"""Debug script to identify feature mismatch between training and inference.

Builds the same DatasetEnsemble twice — once through the training path
(create_cat_training_dataset) and once through the inference path
(create_batches + the column dropping done in predict_proba) — then diffs
the resulting feature sets to pinpoint any mismatch.
"""

from entropice.ml.dataset import DatasetEnsemble

# NOTE(review): an earlier comment claimed "level 6 (the actual level used in
# production)" but the code constructs the ensemble with level=10 — confirm
# which level production actually uses before trusting these results.
ensemble = DatasetEnsemble(
    grid="healpix",
    level=10,
    target="darts_mllabels",
    members=[
        "AlphaEarth",
        "ArcticDEM",
        "ERA5-yearly",
        "ERA5-seasonal",
        "ERA5-shoulder",
    ],
    add_lonlat=True,
    filter_target=False,
)

print("=" * 80)
print("Creating training dataset...")
print("=" * 80)
training_data = ensemble.create_cat_training_dataset(task="binary", device="cpu")
# Feature set as seen by the model during training.
training_features = set(training_data.X.data.columns)
print(f"\nTraining dataset created with {len(training_features)} features")
print(f"Sample features: {sorted(training_features)[:10]}")

print("\n" + "=" * 80)
print("Creating inference batch...")
print("=" * 80)
batch_generator = ensemble.create_batches(batch_size=100, cache_mode="n")
# Only the first batch is needed; its columns are representative.
batch = next(batch_generator, None)

if batch is None:
    print("ERROR: No batch created!")
else:
    print(f"\nBatch created with {len(batch.columns)} columns")
    print(f"Batch columns: {sorted(batch.columns)[:15]}")

    # Simulate the column dropping in predict_proba (inference.py):
    # geometry is never a feature, and target/label columns are removed
    # based on which target the ensemble was built for.
    cols_to_drop = ["geometry"]
    if ensemble.target == "darts_mllabels":
        cols_to_drop += [col for col in batch.columns if col.startswith("dartsml_")]
    else:
        cols_to_drop += [col for col in batch.columns if col.startswith("darts_")]

    print(f"\nColumns to drop: {cols_to_drop}")

    inference_batch = batch.drop(columns=cols_to_drop)
    # Feature set as seen by the model at inference time.
    inference_features = set(inference_batch.columns)

    print(f"\nInference batch after dropping has {len(inference_features)} features")
    print(f"Sample features: {sorted(inference_features)[:10]}")

    print("\n" + "=" * 80)
    print("COMPARISON")
    print("=" * 80)
    print(f"Training features: {len(training_features)}")
    print(f"Inference features: {len(inference_features)}")

    if training_features == inference_features:
        print("\n✅ SUCCESS: Features match perfectly!")
    else:
        print("\n❌ MISMATCH DETECTED!")
        # Symmetric difference, reported per side so the culprit path is obvious.
        only_in_training = training_features - inference_features
        only_in_inference = inference_features - training_features

        if only_in_training:
            print(f"\n⚠️ Only in TRAINING ({len(only_in_training)}): {sorted(only_in_training)}")
        if only_in_inference:
            print(f"\n⚠️ Only in INFERENCE ({len(only_in_inference)}): {sorted(only_in_inference)}")