# entropice/tests/debug_feature_mismatch.py

"""Debug script to identify feature mismatch between training and inference."""

from entropice.ml.dataset import DatasetEnsemble

# Build the ensemble whose training-time and inference-time feature sets are
# compared below.
# NOTE(review): this comment previously read "Test with level 6 (the actual
# level used in production)" while the code passes level=10 — confirm which
# level is intended and align the value or the comment accordingly.
ensemble = DatasetEnsemble(
    grid="healpix",
    level=10,
    target="darts_mllabels",
    members=[
        "AlphaEarth",
        "ArcticDEM",
        "ERA5-yearly",
        "ERA5-seasonal",
        "ERA5-shoulder",
    ],
    add_lonlat=True,
    filter_target=False,
)

# --- Training side: collect the column names of the training feature table. ---
print("=" * 80)
print("Creating training dataset...")
print("=" * 80)
training_data = ensemble.create_cat_training_dataset(task="binary", device="cpu")
training_features = set(training_data.X.data.columns)
print(f"\nTraining dataset created with {len(training_features)} features")
print(f"Sample features: {sorted(training_features)[:10]}")

# --- Inference side: pull a single batch and derive its feature columns. ---
print("\n" + "=" * 80)
print("Creating inference batch...")
print("=" * 80)
batch_generator = ensemble.create_batches(batch_size=100, cache_mode="n")
# Only the first batch is inspected; None signals that the generator was empty.
batch = next(batch_generator, None)
if batch is None:
    print("ERROR: No batch created!")
else:
    print(f"\nBatch created with {len(batch.columns)} columns")
    print(f"Batch columns: {sorted(batch.columns)[:15]}")

    # Simulate the column dropping in predict_proba (inference.py): the
    # geometry column plus every target-derived column is removed, where the
    # column prefix depends on which target the ensemble was built for.
    cols_to_drop = ["geometry"]
    if ensemble.target == "darts_mllabels":
        cols_to_drop += [col for col in batch.columns if col.startswith("dartsml_")]
    else:
        cols_to_drop += [col for col in batch.columns if col.startswith("darts_")]

    print(f"\nColumns to drop: {cols_to_drop}")

    inference_batch = batch.drop(columns=cols_to_drop)
    inference_features = set(inference_batch.columns)

    print(f"\nInference batch after dropping has {len(inference_features)} features")
    print(f"Sample features: {sorted(inference_features)[:10]}")

    # --- Comparison: report counts, then the symmetric difference if any. ---
    print("\n" + "=" * 80)
    print("COMPARISON")
    print("=" * 80)
    print(f"Training features: {len(training_features)}")
    print(f"Inference features: {len(inference_features)}")

    if training_features == inference_features:
        print("\n✅ SUCCESS: Features match perfectly!")
    else:
        print("\n❌ MISMATCH DETECTED!")
        only_in_training = training_features - inference_features
        only_in_inference = inference_features - training_features

        if only_in_training:
            print(f"\n⚠️  Only in TRAINING ({len(only_in_training)}): {sorted(only_in_training)}")
        if only_in_inference:
            print(f"\n⚠️  Only in INFERENCE ({len(only_in_inference)}): {sorted(only_in_inference)}")
Enhance training analysis page with test metrics and confusion matrix

- Added a section to display test metrics for model performance on the held-out test set.
- Implemented confusion matrix visualization to analyze prediction breakdown.
- Refactored sidebar settings to streamline metric selection and improve user experience.
- Updated cross-validation statistics to compare CV performance with test metrics.
- Enhanced DatasetEnsemble methods to handle empty data scenarios gracefully.
- Introduced debug scripts to assist in identifying feature mismatches and validating dataset preparation.
- Added comprehensive tests for DatasetEnsemble to ensure feature consistency and correct behavior across various scenarios.

2026-01-07 15:56:02 +01:00

`"""Debug script to identify feature mismatch between training and inference."""`

			`from entropice.ml.dataset import DatasetEnsemble`

			`# Test with level 6 (the actual level used in production)`
			`ensemble = DatasetEnsemble(`
			`grid="healpix",`
			`level=10,`
			`target="darts_mllabels",`
			`members=[`
			`"AlphaEarth",`
			`"ArcticDEM",`
			`"ERA5-yearly",`
			`"ERA5-seasonal",`
			`"ERA5-shoulder",`
			`],`
			`add_lonlat=True,`
			`filter_target=False,`
			`)`

			`print("=" * 80)`
			`print("Creating training dataset...")`
			`print("=" * 80)`
			`training_data = ensemble.create_cat_training_dataset(task="binary", device="cpu")`
			`training_features = set(training_data.X.data.columns)`
			`print(f"\nTraining dataset created with {len(training_features)} features")`
			`print(f"Sample features: {sorted(list(training_features))[:10]}")`

			`print("\n" + "=" * 80)`
			`print("Creating inference batch...")`
			`print("=" * 80)`
			`batch_generator = ensemble.create_batches(batch_size=100, cache_mode="n")`
			`batch = next(batch_generator, None)`
			`# for batch in batch_generator:`
			`if batch is None:`
			`print("ERROR: No batch created!")`
			`else:`
			`print(f"\nBatch created with {len(batch.columns)} columns")`
			`print(f"Batch columns: {sorted(batch.columns)[:15]}")`

			`# Simulate the column dropping in predict_proba (inference.py)`
			`cols_to_drop = ["geometry"]`
			`if ensemble.target == "darts_mllabels":`
			`cols_to_drop += [col for col in batch.columns if col.startswith("dartsml_")]`
			`else:`
			`cols_to_drop += [col for col in batch.columns if col.startswith("darts_")]`

			`print(f"\nColumns to drop: {cols_to_drop}")`

			`inference_batch = batch.drop(columns=cols_to_drop)`
			`inference_features = set(inference_batch.columns)`

			`print(f"\nInference batch after dropping has {len(inference_features)} features")`
			`print(f"Sample features: {sorted(list(inference_features))[:10]}")`

			`print("\n" + "=" * 80)`
			`print("COMPARISON")`
			`print("=" * 80)`
			`print(f"Training features: {len(training_features)}")`
			`print(f"Inference features: {len(inference_features)}")`

			`if training_features == inference_features:`
			`print("\n✅ SUCCESS: Features match perfectly!")`
			`else:`
			`print("\n❌ MISMATCH DETECTED!")`
			`only_in_training = training_features - inference_features`
			`only_in_inference = inference_features - training_features`

			`if only_in_training:`
			`print(f"\n⚠️ Only in TRAINING ({len(only_in_training)}): {sorted(only_in_training)}")`
			`if only_in_inference:`
			`print(f"\n⚠️ Only in INFERENCE ({len(only_in_inference)}): {sorted(only_in_inference)}")`