Enhance training analysis page with test metrics and confusion matrix

- Added a section to display test metrics for model performance on the held-out test set.
- Implemented confusion matrix visualization to analyze prediction breakdown.
- Refactored sidebar settings to streamline metric selection and improve user experience.
- Updated cross-validation statistics to compare CV performance with test metrics.
- Enhanced DatasetEnsemble methods to handle empty data scenarios gracefully.
- Introduced debug scripts to assist in identifying feature mismatches and validating dataset preparation.
- Added comprehensive tests for DatasetEnsemble to ensure feature consistency and correct behavior across various scenarios.
This commit is contained in:
Tobias Hölzer 2026-01-07 15:56:02 +01:00
parent 4fecac535c
commit c92e856c55
23 changed files with 1845 additions and 484 deletions

View file

@@ -0,0 +1,72 @@
"""Debug script to identify feature mismatch between training and inference."""
from entropice.ml.dataset import DatasetEnsemble
# Test with level 6 (the actual level used in production)
ensemble = DatasetEnsemble(
grid="healpix",
level=10,
target="darts_mllabels",
members=[
"AlphaEarth",
"ArcticDEM",
"ERA5-yearly",
"ERA5-seasonal",
"ERA5-shoulder",
],
add_lonlat=True,
filter_target=False,
)
print("=" * 80)
print("Creating training dataset...")
print("=" * 80)
training_data = ensemble.create_cat_training_dataset(task="binary", device="cpu")
training_features = set(training_data.X.data.columns)
print(f"\nTraining dataset created with {len(training_features)} features")
print(f"Sample features: {sorted(list(training_features))[:10]}")
print("\n" + "=" * 80)
print("Creating inference batch...")
print("=" * 80)
batch_generator = ensemble.create_batches(batch_size=100, cache_mode="n")
batch = next(batch_generator, None)
# for batch in batch_generator:
if batch is None:
print("ERROR: No batch created!")
else:
print(f"\nBatch created with {len(batch.columns)} columns")
print(f"Batch columns: {sorted(batch.columns)[:15]}")
# Simulate the column dropping in predict_proba (inference.py)
cols_to_drop = ["geometry"]
if ensemble.target == "darts_mllabels":
cols_to_drop += [col for col in batch.columns if col.startswith("dartsml_")]
else:
cols_to_drop += [col for col in batch.columns if col.startswith("darts_")]
print(f"\nColumns to drop: {cols_to_drop}")
inference_batch = batch.drop(columns=cols_to_drop)
inference_features = set(inference_batch.columns)
print(f"\nInference batch after dropping has {len(inference_features)} features")
print(f"Sample features: {sorted(list(inference_features))[:10]}")
print("\n" + "=" * 80)
print("COMPARISON")
print("=" * 80)
print(f"Training features: {len(training_features)}")
print(f"Inference features: {len(inference_features)}")
if training_features == inference_features:
print("\n✅ SUCCESS: Features match perfectly!")
else:
print("\n❌ MISMATCH DETECTED!")
only_in_training = training_features - inference_features
only_in_inference = inference_features - training_features
if only_in_training:
print(f"\n⚠️ Only in TRAINING ({len(only_in_training)}): {sorted(only_in_training)}")
if only_in_inference:
print(f"\n⚠️ Only in INFERENCE ({len(only_in_inference)}): {sorted(only_in_inference)}")