Unify the training scripts and add SHAP

This commit is contained in:
Tobias Hölzer 2026-02-08 22:06:49 +01:00
parent 073502c51d
commit 3ce6b6e867
38 changed files with 5876 additions and 681 deletions

14
autogluon-config.toml Normal file
View file

@ -0,0 +1,14 @@
[tool.entropice-autogluon]
time-limit = 60
presets = "medium"
target = "darts_v1"
task = "density"
experiment = "tobi-tests"
grid = "hex"
level = 5
members = ["ERA5-shoulder", "ArcticDEM"]
[tool.entropice-autogluon.dimension-filters]
# ERA5-shoulder = { aggregations = "median" }
ArcticDEM = { aggregations = "median" }

19
hpsearchcv-config.toml Normal file
View file

@ -0,0 +1,19 @@
[tool.entropice-hpsearchcv]
n-iter = 5
target = "darts_v1"
task = "binary"
splitter = "kfold"
# model = "xgboost"
# model = "rf" SHAP error
model = "espa"
experiment = "tobi-tests"
scaler = "standard" # They don't work because of the Array API
normalize = true
grid = "hex"
level = 5
members = ["ERA5-shoulder", "ArcticDEM"]
[tool.entropice-hpsearchcv.dimension-filters]
# ERA5-shoulder = { aggregations = "median" }
ArcticDEM = { aggregations = "median" }

1544
pixi.lock generated

File diff suppressed because it is too large Load diff

View file

@ -67,8 +67,10 @@ dependencies = [
"ruff>=0.14.11,<0.15", "ruff>=0.14.11,<0.15",
"pandas-stubs>=2.3.3.251201,<3", "pandas-stubs>=2.3.3.251201,<3",
"pytest>=9.0.2,<10", "pytest>=9.0.2,<10",
"autogluon-tabular[all,mitra]>=1.5.0", "autogluon-tabular[all,mitra,realmlp,interpret,fastai,tabm,tabpfn,tabdpt,tabpfnmix,tabicl,skew,imodels]>=1.5.0",
"shap>=0.50.0,<0.51", "h5py>=3.15.1,<4", "shap>=0.50.0,<0.51",
"h5py>=3.15.1,<4",
"pydantic>=2.12.5,<3",
] ]
[project.scripts] [project.scripts]
@ -77,8 +79,8 @@ darts = "entropice.ingest.darts:cli"
alpha-earth = "entropice.ingest.alphaearth:main" alpha-earth = "entropice.ingest.alphaearth:main"
era5 = "entropice.ingest.era5:cli" era5 = "entropice.ingest.era5:cli"
arcticdem = "entropice.ingest.arcticdem:cli" arcticdem = "entropice.ingest.arcticdem:cli"
train = "entropice.ml.training:cli" train = "entropice.ml.hpsearchcv:cli"
autogluon = "entropice.ml.autogluon_training:cli" autogluon = "entropice.ml.autogluon:cli"
[build-system] [build-system]
requires = ["hatchling"] requires = ["hatchling"]

View file

@ -5,6 +5,7 @@ Pages:
- Overview: List of available result directories with some summary statistics. - Overview: List of available result directories with some summary statistics.
- Training Data: Visualization of training data distributions. - Training Data: Visualization of training data distributions.
- Training Results Analysis: Analysis of training results and model performance. - Training Results Analysis: Analysis of training results and model performance.
- Experiment Analysis: Compare multiple training runs within an experiment.
- AutoGluon Analysis: Analysis of AutoGluon training results with SHAP visualizations. - AutoGluon Analysis: Analysis of AutoGluon training results with SHAP visualizations.
- Model State: Visualization of model state and features. - Model State: Visualization of model state and features.
- Inference: Visualization of inference results. - Inference: Visualization of inference results.
@ -13,6 +14,7 @@ Pages:
import streamlit as st import streamlit as st
from entropice.dashboard.views.dataset_page import render_dataset_page from entropice.dashboard.views.dataset_page import render_dataset_page
from entropice.dashboard.views.experiment_analysis_page import render_experiment_analysis_page
from entropice.dashboard.views.inference_page import render_inference_page from entropice.dashboard.views.inference_page import render_inference_page
from entropice.dashboard.views.model_state_page import render_model_state_page from entropice.dashboard.views.model_state_page import render_model_state_page
from entropice.dashboard.views.overview_page import render_overview_page from entropice.dashboard.views.overview_page import render_overview_page
@ -27,6 +29,7 @@ def main():
overview_page = st.Page(render_overview_page, title="Overview", icon="🏡", default=True) overview_page = st.Page(render_overview_page, title="Overview", icon="🏡", default=True)
data_page = st.Page(render_dataset_page, title="Dataset", icon="📊") data_page = st.Page(render_dataset_page, title="Dataset", icon="📊")
training_analysis_page = st.Page(render_training_analysis_page, title="Training Results Analysis", icon="🦾") training_analysis_page = st.Page(render_training_analysis_page, title="Training Results Analysis", icon="🦾")
experiment_analysis_page = st.Page(render_experiment_analysis_page, title="Experiment Analysis", icon="🔬")
model_state_page = st.Page(render_model_state_page, title="Model State", icon="🧮") model_state_page = st.Page(render_model_state_page, title="Model State", icon="🧮")
inference_page = st.Page(render_inference_page, title="Inference", icon="🗺️") inference_page = st.Page(render_inference_page, title="Inference", icon="🗺️")
@ -34,7 +37,7 @@ def main():
{ {
"Overview": [overview_page], "Overview": [overview_page],
"Data": [data_page], "Data": [data_page],
"Experiments": [training_analysis_page, model_state_page], "Experiments": [training_analysis_page, experiment_analysis_page, model_state_page],
"Inference": [inference_page], "Inference": [inference_page],
} }
) )

View file

@ -0,0 +1,619 @@
"""Plots for cross-dataset correlation and similarity analysis."""
from typing import Literal
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
def select_top_variable_features(
    data_dict: dict[str, pd.Series],
    n_features: int = 500,
    method: Literal["variance", "iqr", "cv"] = "variance",
) -> dict[str, pd.Series]:
    """Select the top N most variable features from a dataset.

    Args:
        data_dict: Dictionary mapping variable names to pandas Series
        n_features: Number of features to select
        method: Method to measure variability ('variance', 'iqr', or 'cv')

    Returns:
        Filtered dictionary with only the most variable features. If the input
        already has at most n_features entries it is returned unchanged.
    """
    if len(data_dict) <= n_features:
        return data_dict
    # Score each feature; empty / all-NaN series score 0 so they rank last.
    variability_scores: dict[str, float] = {}
    for var_name, values in data_dict.items():
        clean_values = values.dropna()
        if len(clean_values) == 0:
            variability_scores[var_name] = 0
            continue
        if method == "variance":
            variability_scores[var_name] = clean_values.var()
        elif method == "iqr":
            variability_scores[var_name] = clean_values.quantile(0.75) - clean_values.quantile(0.25)
        elif method == "cv":
            mean_val = clean_values.mean()
            if abs(mean_val) > 1e-10:
                variability_scores[var_name] = clean_values.std() / abs(mean_val)
            else:
                # Near-zero mean would blow up the CV; treat as non-variable.
                variability_scores[var_name] = 0
    # Sort by variability and keep the n_features highest-scoring names.
    # Use a set for O(1) membership in the final filter (was an O(n) list scan
    # per feature, i.e. O(n^2) overall for large feature counts).
    sorted_vars = sorted(variability_scores.items(), key=lambda x: x[1], reverse=True)
    top_vars = {var_name for var_name, _ in sorted_vars[:n_features]}
    # Preserve the original insertion order of data_dict in the result.
    return {k: v for k, v in data_dict.items() if k in top_vars}
def create_matplotlib_correlation_heatmap(
    data_dict: dict[str, pd.Series],
    method: Literal["pearson", "kendall", "spearman"] = "pearson",
    cluster: bool = False,
    max_labels: int = 100,
) -> plt.Figure:
    """Create a correlation heatmap using matplotlib for large datasets.

    Static (matplotlib/seaborn) counterpart of create_full_correlation_heatmap,
    intended for variable counts where an interactive plot becomes unwieldy.

    Args:
        data_dict: Dictionary mapping variable names to pandas Series with cell_ids as index
        method: Correlation method ('pearson', 'spearman', or 'kendall')
        cluster: Whether to reorder variables by hierarchical clustering
        max_labels: Maximum number of labels to show on axes

    Returns:
        Matplotlib Figure with correlation heatmap
    """
    # Degenerate input: render a placeholder figure instead of raising.
    if len(data_dict) < 2:
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.text(0.5, 0.5, "Need at least 2 variables for correlation analysis", ha="center", va="center", fontsize=12)
        ax.axis("off")
        return fig
    # Create DataFrame from all variables (aligned on the shared index)
    df = pd.DataFrame(data_dict)
    # Drop rows with any NaN to get valid correlations
    df_clean = df.dropna()
    if len(df_clean) == 0:
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.text(0.5, 0.5, "No overlapping non-null data between variables", ha="center", va="center", fontsize=12)
        ax.axis("off")
        return fig
    # Subsample if too many rows for performance (fixed seed keeps plots reproducible)
    if len(df_clean) > 50000:
        df_clean = df_clean.sample(n=50000, random_state=42)
    # Calculate correlation matrix
    corr = df_clean.corr(method=method)
    # Apply hierarchical clustering if requested
    if cluster and len(corr) > 2:
        # Use correlation distance (1 - |r|) so strongly anti-correlated
        # variables cluster together with strongly correlated ones
        corr_dist = 1 - np.abs(corr.values)
        np.fill_diagonal(corr_dist, 0)
        condensed_dist = squareform(corr_dist, checks=False)
        linkage_matrix = linkage(condensed_dist, method="average")
        dendro = dendrogram(linkage_matrix, no_plot=True)
        cluster_order = dendro["leaves"]
        # Reorder correlation matrix so clustered variables are adjacent
        corr = corr.iloc[cluster_order, cluster_order]
    # Determine figure size based on number of variables (capped at 50 inches)
    n_vars = len(corr)
    fig_size = max(10, min(n_vars * 0.3, 50))
    # Create figure
    fig, ax = plt.subplots(figsize=(fig_size, fig_size))
    # Create heatmap using seaborn; cell borders are dropped above 50 variables
    # and tick labels above max_labels, both for readability
    sns.heatmap(
        corr,
        cmap="RdBu_r",
        center=0,
        vmin=-1,
        vmax=1,
        square=True,
        linewidths=0.5 if n_vars < 50 else 0,
        cbar_kws={"shrink": 0.8, "label": f"{method.title()} Correlation"},
        ax=ax,
        xticklabels=n_vars <= max_labels,
        yticklabels=n_vars <= max_labels,
    )
    # Set title
    title = f"Correlation Matrix ({method.title()}, {n_vars} variables)"
    if cluster:
        title += " - Hierarchically Clustered"
    ax.set_title(title, fontsize=14, pad=20)
    # Rotate labels if shown
    if n_vars <= max_labels:
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", fontsize=8)
        plt.setp(ax.get_yticklabels(), rotation=0, fontsize=8)
    plt.tight_layout()
    return fig
def create_full_correlation_heatmap(
    data_dict: dict[str, pd.Series],
    method: Literal["pearson", "kendall", "spearman"] = "pearson",
    cluster: bool = False,
) -> go.Figure:
    """Create a correlation heatmap for all variables across all datasets.

    Interactive plotly counterpart of create_matplotlib_correlation_heatmap;
    every cell is annotated with its rounded value.

    Args:
        data_dict: Dictionary mapping variable names to pandas Series with cell_ids as index
        method: Correlation method ('pearson', 'spearman', or 'kendall')
        cluster: Whether to reorder variables by hierarchical clustering

    Returns:
        Plotly Figure with correlation heatmap
    """
    # Degenerate input: return an annotated empty figure instead of raising.
    if len(data_dict) < 2:
        return go.Figure().add_annotation(
            text="Need at least 2 variables for correlation analysis",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
    # Create DataFrame from all variables (aligned on the shared index)
    df = pd.DataFrame(data_dict)
    # Drop rows with any NaN to get valid correlations
    df_clean = df.dropna()
    if len(df_clean) == 0:
        return go.Figure().add_annotation(
            text="No overlapping non-null data between variables",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
    # Subsample if too many rows for performance (fixed seed keeps plots reproducible)
    if len(df_clean) > 50000:
        df_clean = df_clean.sample(n=50000, random_state=42)
    # Calculate correlation matrix
    corr = df_clean.corr(method=method)
    # Apply hierarchical clustering if requested
    if cluster and len(corr) > 2:
        # Use correlation distance (1 - |r|) so strongly anti-correlated
        # variables cluster together with strongly correlated ones
        corr_dist = 1 - np.abs(corr.values)
        np.fill_diagonal(corr_dist, 0)  # Ensure diagonal is 0
        condensed_dist = squareform(corr_dist, checks=False)
        linkage_matrix = linkage(condensed_dist, method="average")
        dendro = dendrogram(linkage_matrix, no_plot=True)
        cluster_order = dendro["leaves"]
        # Reorder correlation matrix so clustered variables are adjacent
        corr = corr.iloc[cluster_order, cluster_order]
    # Create heatmap; diverging colorscale centered at 0 over the full [-1, 1] range
    fig = go.Figure(
        data=go.Heatmap(
            z=corr.values,
            x=corr.columns,
            y=corr.index,
            colorscale="RdBu_r",
            zmid=0,
            zmin=-1,
            zmax=1,
            text=np.round(corr.values, 2),
            texttemplate="%{text}",
            textfont={"size": 8},
            colorbar={"title": f"{method.title()}<br>Correlation"},
            hovertemplate=f"<b>%{{y}}</b> vs <b>%{{x}}</b><br>{method.title()} correlation: %{{z:.3f}}<extra></extra>",
        )
    )
    title = f"Cross-Dataset Variable Correlations ({method.title()})"
    if cluster:
        title += " - Hierarchically Clustered"
    # Scale the canvas with the number of variables so cells stay legible
    fig.update_layout(
        title=title,
        height=max(600, len(corr) * 15),
        width=max(800, len(corr) * 15),
        xaxis={"side": "bottom", "tickangle": 45},
        yaxis={"tickangle": 0},
    )
    return fig
def create_pca_biplot(
    data_dict: dict[str, pd.Series],
    n_components: int = 2,
    show_loadings: bool = True,
) -> go.Figure:
    """Create a PCA biplot showing feature relationships in principal component space.

    Observations are drawn as a scatter cloud in PC1/PC2; optionally each
    feature's loading is drawn as a vector from the origin.

    Args:
        data_dict: Dictionary mapping variable names to pandas Series with cell_ids as index
        n_components: Number of principal components to compute
        show_loadings: Whether to show loading vectors

    Returns:
        Plotly Figure with PCA biplot
    """
    # Degenerate input: return an annotated empty figure instead of raising.
    if len(data_dict) < 2:
        return go.Figure().add_annotation(
            text="Need at least 2 variables for PCA analysis",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
    # Create DataFrame and clean
    df = pd.DataFrame(data_dict).dropna()
    if len(df) < 10:
        return go.Figure().add_annotation(
            text="Not enough overlapping data for PCA analysis",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
    # Subsample if needed (fixed seed keeps plots reproducible)
    if len(df) > 50000:
        df = df.sample(n=50000, random_state=42)
    # Standardize features so PCA is not dominated by large-scale variables
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df)
    # Perform PCA; cap the component count at what the data can support
    n_components = min(n_components, len(data_dict), len(df))
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(scaled_data)
    # Create figure
    fig = go.Figure()
    if n_components >= 2:
        # Scatter plot of observations (subsampled for visibility)
        sample_size = min(5000, len(pca_result))
        rng = np.random.default_rng(42)
        indices = rng.choice(len(pca_result), sample_size, replace=False)
        fig.add_trace(
            go.Scatter(
                x=pca_result[indices, 0],
                y=pca_result[indices, 1],
                mode="markers",
                marker={"size": 3, "opacity": 0.3, "color": "lightblue"},
                name="Observations",
                showlegend=True,
            )
        )
        # Add loading vectors if requested
        if show_loadings:
            # Loadings = eigenvectors scaled by sqrt(explained variance)
            loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
            # Scale loadings for visibility relative to the observation cloud
            # NOTE(review): if every loading is zero (all-constant features),
            # max_loading is 0 and this divides by zero — confirm inputs are
            # variance-filtered upstream.
            max_loading = np.abs(loadings[:, :2]).max()
            max_data = max(np.abs(pca_result[:, :2]).max(axis=0))
            scale = max_data / max_loading * 0.8
            for i, var_name in enumerate(df.columns):
                fig.add_trace(
                    go.Scatter(
                        x=[0, loadings[i, 0] * scale],
                        y=[0, loadings[i, 1] * scale],
                        mode="lines+text",
                        line={"color": "red", "width": 2},
                        text=["", var_name],
                        textposition="top center",
                        textfont={"size": 10, "color": "darkred"},
                        name=var_name,
                        showlegend=False,
                        hovertemplate=f"<b>{var_name}</b><br>"
                        + f"PC1 loading: {loadings[i, 0]:.3f}<br>"
                        + f"PC2 loading: {loadings[i, 1]:.3f}<extra></extra>",
                    )
                )
    # Axis titles carry the explained-variance share of each component.
    # NOTE(review): var_exp[1] assumes at least 2 components were fitted;
    # with n_components=1 this would raise IndexError — confirm callers
    # always request >= 2 components.
    var_exp = pca.explained_variance_ratio_
    fig.update_xaxes(title=f"PC1 ({var_exp[0] * 100:.1f}% variance)")
    fig.update_yaxes(title=f"PC2 ({var_exp[1] * 100:.1f}% variance)")
    fig.update_layout(
        title="PCA Biplot - Feature Relationships in Principal Component Space",
        height=700,
        showlegend=show_loadings,
        hovermode="closest",
    )
    return fig
def create_dendrogram_plot(
    data_dict: dict[str, pd.Series],
    method: Literal["pearson", "kendall", "spearman"] = "pearson",
) -> go.Figure:
    """Create a dendrogram showing hierarchical clustering of variables based on correlation.

    Args:
        data_dict: Dictionary mapping variable names to pandas Series with cell_ids as index
        method: Correlation method to use for distance calculation

    Returns:
        Plotly Figure with dendrogram
    """
    # Degenerate input: return an annotated empty figure instead of raising.
    if len(data_dict) < 2:
        return go.Figure().add_annotation(
            text="Need at least 2 variables for clustering",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
    # Create DataFrame and clean
    df = pd.DataFrame(data_dict).dropna()
    if len(df) < 10:
        return go.Figure().add_annotation(
            text="Not enough overlapping data for clustering",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
    # Subsample if needed (fixed seed keeps plots reproducible)
    if len(df) > 50000:
        df = df.sample(n=50000, random_state=42)
    # Calculate correlation and convert to distance (1 - |r|), so strongly
    # correlated variables — positive or negative — end up close together
    corr = df.corr(method=method)
    corr_dist = 1 - np.abs(corr.values)
    np.fill_diagonal(corr_dist, 0)
    # Perform hierarchical clustering (average linkage on condensed distances)
    condensed_dist = squareform(corr_dist, checks=False)
    linkage_matrix = linkage(condensed_dist, method="average")
    # Create dendrogram without drawing it; its geometry is re-rendered in plotly
    dendro = dendrogram(linkage_matrix, labels=list(df.columns), no_plot=True)
    # Extract dendrogram data: icoord/dcoord are the x/y segments of each link
    icoord = np.array(dendro["icoord"])
    dcoord = np.array(dendro["dcoord"])
    # Create plotly figure
    fig = go.Figure()
    # Add dendrogram lines (one trace per link)
    for i in range(len(icoord)):
        fig.add_trace(
            go.Scatter(
                x=icoord[i],
                y=dcoord[i],
                mode="lines",
                line={"color": "black", "width": 1.5},
                hoverinfo="skip",
                showlegend=False,
            )
        )
    # Add labels; scipy places leaves at x = 5, 15, 25, ... (10*i + 5),
    # so tick positions are generated on the same grid
    labels = dendro["ivl"]
    label_pos = np.arange(5, len(labels) * 10 + 5, 10)
    fig.update_layout(
        title=f"Hierarchical Clustering of Variables (based on {method.title()} correlation)",
        xaxis={
            "title": "Variables",
            "tickmode": "array",
            "tickvals": label_pos,
            "ticktext": labels,
            "tickangle": 45,
        },
        yaxis={"title": "Distance (1 - |correlation|)"},
        height=600,
        showlegend=False,
    )
    return fig
def create_mutual_information_matrix(data_dict: dict[str, pd.Series]) -> go.Figure:
    """Create a mutual information matrix to capture non-linear relationships.

    Unlike linear correlation, mutual information also reflects non-linear
    and non-monotonic dependencies between variable pairs.

    Args:
        data_dict: Dictionary mapping variable names to pandas Series with cell_ids as index

    Returns:
        Plotly Figure with mutual information heatmap
    """
    # Local import: only this plot needs the (slow to import) estimator
    from sklearn.feature_selection import mutual_info_regression

    # Degenerate input: return an annotated empty figure instead of raising.
    if len(data_dict) < 2:
        return go.Figure().add_annotation(
            text="Need at least 2 variables for mutual information analysis",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
    # Create DataFrame and clean
    df = pd.DataFrame(data_dict).dropna()
    if len(df) < 100:
        return go.Figure().add_annotation(
            text="Not enough overlapping data for mutual information analysis",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
    # Subsample if needed
    if len(df) > 10000:  # MI is computationally expensive
        df = df.sample(n=10000, random_state=42)
    # Calculate pairwise mutual information; the matrix is symmetric, so only
    # the i < j half is estimated and mirrored (O(n_vars^2) estimator calls)
    n_vars = len(df.columns)
    mi_matrix = np.zeros((n_vars, n_vars))
    for i, col_i in enumerate(df.columns):
        for j, col_j in enumerate(df.columns):
            if i == j:
                mi_matrix[i, j] = 1.0  # Self-information (normalized)
            elif i < j:
                # Calculate MI
                X = df[col_i].to_numpy().reshape(-1, 1)  # noqa: N806
                y = df[col_j].to_numpy()
                mi = mutual_info_regression(X, y, random_state=42)[0]
                # Normalize by entropy (approximate)
                # NOTE(review): dividing MI by std(X)*std(y) is not an entropy
                # normalization — the result is not bounded to [0, 1] and depends
                # on the variables' scales. Confirm whether a true normalized MI
                # (e.g. MI / sqrt(H(X)*H(Y))) was intended.
                mi_norm = mi / (np.std(y) * np.std(X.flatten()) + 1e-10)
                mi_matrix[i, j] = mi_norm
                mi_matrix[j, i] = mi_norm
    # Create heatmap
    fig = go.Figure(
        data=go.Heatmap(
            z=mi_matrix,
            x=list(df.columns),
            y=list(df.columns),
            colorscale="YlOrRd",
            zmin=0,
            text=np.round(mi_matrix, 3),
            texttemplate="%{text}",
            textfont={"size": 8},
            colorbar={"title": "Normalized<br>Mutual Info"},
            hovertemplate="<b>%{y}</b> vs <b>%{x}</b><br>Mutual Information: %{z:.3f}<extra></extra>",
        )
    )
    # Scale the canvas with the number of variables so cells stay legible
    fig.update_layout(
        title="Mutual Information Matrix (captures non-linear relationships)",
        height=max(600, n_vars * 15),
        width=max(800, n_vars * 15),
        xaxis={"side": "bottom", "tickangle": 45},
        yaxis={"tickangle": 0},
    )
    return fig
def create_feature_variance_plot(data_dict: dict[str, pd.Series]) -> go.Figure:
    """Create a bar plot showing variance/spread of each feature.

    Args:
        data_dict: Dictionary mapping variable names to pandas Series with cell_ids as index

    Returns:
        Plotly Figure with a 2x2 grid of spread metrics (std dev, IQR, range,
        coefficient of variation), variables sorted by variance in every panel.
    """
    if len(data_dict) == 0:
        return go.Figure().add_annotation(
            text="No variables provided",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
    # Calculate statistics for each variable; empty / all-NaN series are skipped
    stats_data = []
    for var_name, values in data_dict.items():
        clean_values = values.dropna()
        if len(clean_values) > 0:
            stats_data.append(
                {
                    "Variable": var_name,
                    "Std Dev": clean_values.std(),
                    "Variance": clean_values.var(),
                    "IQR": clean_values.quantile(0.75) - clean_values.quantile(0.25),
                    "Range": clean_values.max() - clean_values.min(),
                    # Epsilon guards against division by a zero mean
                    "CV": clean_values.std() / (abs(clean_values.mean()) + 1e-10),
                }
            )
    # Bug fix: with only all-NaN inputs stats_data was empty and
    # sort_values("Variance") on an empty, column-less DataFrame raised KeyError.
    if not stats_data:
        return go.Figure().add_annotation(
            text="No variables with non-null data",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
    stats_df = pd.DataFrame(stats_data).sort_values("Variance", ascending=False)
    # Create subplots for different variance metrics
    fig = make_subplots(
        rows=2,
        cols=2,
        subplot_titles=["Standard Deviation", "Interquartile Range (IQR)", "Range", "Coefficient of Variation"],
        vertical_spacing=0.12,
        horizontal_spacing=0.10,
    )
    # (metric column, subplot row, subplot col)
    metrics = [
        ("Std Dev", 1, 1),
        ("IQR", 1, 2),
        ("Range", 2, 1),
        ("CV", 2, 2),
    ]
    for metric, row, col in metrics:
        fig.add_trace(
            go.Bar(
                x=stats_df["Variable"],
                y=stats_df[metric],
                name=metric,
                marker={"color": stats_df[metric], "colorscale": "Viridis"},
                showlegend=False,
                hovertemplate=f"<b>%{{x}}</b><br>{metric}: %{{y:.3f}}<extra></extra>",
            ),
            row=row,
            col=col,
        )
        fig.update_xaxes(tickangle=45, row=row, col=col)
        fig.update_yaxes(title_text=metric, row=row, col=col)
    fig.update_layout(
        title="Feature Variance and Spread Metrics",
        height=800,
        showlegend=False,
    )
    return fig

View file

@ -0,0 +1,972 @@
"""Plots for experiment comparison and analysis."""
from typing import TYPE_CHECKING
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from entropice.dashboard.utils.colors import get_palette
from entropice.dashboard.utils.formatters import format_metric_name
if TYPE_CHECKING:
import geopandas as gpd
import pydeck as pdk
def create_grid_level_comparison_plot(
    results_df: pd.DataFrame,
    metric: str,
    split: str = "test",
) -> go.Figure:
    """Create a plot comparing model performance across grid levels.

    Draws a box plot per grid level (colored by target, faceted by task) and
    overlays per-model scatter points connected by dotted lines.

    Args:
        results_df: DataFrame with experiment results including grid, level, model, and metrics
        metric: Metric to compare (e.g., 'f1', 'accuracy', 'r2')
        split: Data split to show ('train', 'test', or 'combined')

    Returns:
        Plotly figure showing performance across grid levels

    Raises:
        ValueError: If the f"{split}_{metric}" column is missing from results_df.
    """
    metric_col = f"{split}_{metric}"
    if metric_col not in results_df.columns:
        raise ValueError(f"Metric {metric_col} not found in results")
    # Create grid_level column for grouping (copy so the caller's frame is untouched)
    results_df = results_df.copy()
    results_df["grid_level"] = results_df["grid"] + "_" + results_df["level"].astype(str)
    # Define the proper order for grid levels by resolution
    grid_level_order = [
        "hex_3",
        "healpix_6",
        "healpix_7",
        "hex_4",
        "healpix_8",
        "hex_5",
        "healpix_9",
        "healpix_10",
        "hex_6",
    ]
    # Create display labels for grid levels
    grid_level_display = {
        "hex_3": "Hex-3",
        "healpix_6": "Healpix-6",
        "healpix_7": "Healpix-7",
        "hex_4": "Hex-4",
        "healpix_8": "Healpix-8",
        "hex_5": "Hex-5",
        "healpix_9": "Healpix-9",
        "healpix_10": "Healpix-10",
        "hex_6": "Hex-6",
    }
    # Add display column
    results_df["grid_level_display"] = results_df["grid_level"].map(grid_level_display)
    # Rank of each grid level in the resolution ordering; hoisted out of the
    # plotting loops (previously rebuilt on every task iteration)
    grid_level_rank = {gl: i for i, gl in enumerate(grid_level_order)}
    # Create color map for target datasets (use 2nd color from 5-color palette for more saturation)
    unique_targets = results_df["target"].unique()
    target_colors = {target: get_palette(target, 5)[1] for target in unique_targets}
    # Create symbol map for models
    model_symbols = {
        "espa": "circle",
        "xgboost": "square",
        "rf": "diamond",
        "knn": "cross",
        "ensemble": "star",
        "autogluon": "star",
    }
    # Add a combined column for hover information
    results_df["model_display"] = results_df["model"].str.upper()
    # Create box plot without individual points first
    fig = px.box(
        results_df,
        x="grid_level_display",
        y=metric_col,
        color="target",
        facet_col="task",
        points=False,  # We'll add points separately with symbols
        title=f"{format_metric_name(metric)} by Grid Level ({split.capitalize()} Set)",
        labels={
            metric_col: format_metric_name(metric),
            "grid_level_display": "Grid Level",
            "target": "Target Dataset",
        },
        color_discrete_map=target_colors,
        category_orders={"grid_level_display": [grid_level_display[gl] for gl in grid_level_order]},
    )
    # Add scatter points with model-specific symbols
    # Group by task for faceting
    if "task" in results_df.columns:
        unique_tasks = results_df["task"].unique()
    else:
        unique_tasks = [None]
    # Add scatter traces and line traces for each target-model combination
    for target in unique_targets:
        target_data = results_df[results_df["target"] == target]
        for model in target_data["model"].unique():
            model_data = target_data[target_data["model"] == model]
            symbol = model_symbols.get(model, "circle")
            # Add scatter trace for each task facet
            for task_idx, task in enumerate(unique_tasks):
                if task is not None:
                    task_data = model_data[model_data["task"] == task]
                else:
                    task_data = model_data
                if len(task_data) == 0:
                    continue
                # Sort by grid_level order for proper line connections.
                # Bug fix: copy before adding the helper column — assigning into
                # a filtered slice triggered pandas' SettingWithCopyWarning.
                task_data = task_data.copy()
                task_data["grid_level_order"] = task_data["grid_level"].map(grid_level_rank)
                task_data = task_data.sort_values("grid_level_order")
                # Determine which subplot this goes to
                row = 1
                col = task_idx + 1 if task is not None else 1
                # Add line trace connecting points of the same model
                line = go.Scatter(
                    x=task_data["grid_level_display"],
                    y=task_data[metric_col],
                    mode="lines",
                    line={
                        "color": target_colors[target],
                        "width": 1.5,
                        "dash": "dot",
                    },
                    showlegend=False,
                    hoverinfo="skip",
                    xaxis=f"x{col}" if col > 1 else "x",
                    yaxis=f"y{col}" if col > 1 else "y",
                )
                fig.add_trace(line, row=row, col=col)
                # Add scatter trace on top of the line
                scatter = go.Scatter(
                    x=task_data["grid_level_display"],
                    y=task_data[metric_col],
                    mode="markers",
                    marker={
                        "symbol": symbol,
                        "size": 8,
                        "color": target_colors[target],
                        "line": {"width": 1, "color": "white"},
                    },
                    name=f"{target} ({model.upper()})",
                    legendgroup=target,
                    showlegend=(task_idx == 0),  # Only show in legend once
                    hovertemplate=(
                        f"<b>{target}</b><br>"
                        f"Model: {model.upper()}<br>"
                        f"Grid Level: %{{x}}<br>"
                        f"{format_metric_name(metric)}: %{{y:.4f}}<br>"
                        "<extra></extra>"
                    ),
                    xaxis=f"x{col}" if col > 1 else "x",
                    yaxis=f"y{col}" if col > 1 else "y",
                )
                fig.add_trace(scatter, row=row, col=col)
    fig.update_layout(
        height=500,
        showlegend=True,
        hovermode="closest",
    )
    return fig
def create_model_ranking_plot(
    results_df: pd.DataFrame,
    metric: str,
    split: str = "test",
    top_n: int = 10,
) -> go.Figure:
    """Create a plot ranking models by performance.

    Args:
        results_df: DataFrame with experiment results
        metric: Metric to rank by
        split: Data split to show
        top_n: Number of top models to show

    Returns:
        Plotly figure showing top models

    Raises:
        ValueError: If the f"{split}_{metric}" column is missing.
    """
    metric_col = f"{split}_{metric}"
    if metric_col not in results_df.columns:
        raise ValueError(f"Metric {metric_col} not found in results")
    # Keep only the top_n highest-scoring rows and label each with its configuration.
    ranked = results_df.nlargest(top_n, metric_col).copy()
    ranked["model_label"] = [
        f"{model} ({grid}_{level}, {task})"
        for model, grid, level, task in zip(
            ranked["model"].astype(str),
            ranked["grid"],
            ranked["level"].astype(str),
            ranked["task"],
        )
    ]
    bar_colors = get_palette("models", len(ranked))
    # Horizontal bars read best for long configuration labels.
    ranking_bar = go.Bar(
        x=ranked[metric_col],
        y=ranked["model_label"],
        orientation="h",
        marker={"color": bar_colors},
        text=ranked[metric_col].round(4),
        textposition="auto",
    )
    fig = go.Figure(data=[ranking_bar])
    fig.update_layout(
        title=f"Top {top_n} Models by {format_metric_name(metric)} ({split.capitalize()} Set)",
        xaxis_title=format_metric_name(metric),
        yaxis_title="Model Configuration",
        height=max(400, top_n * 40),
        showlegend=False,
    )
    return fig
def create_model_consistency_heatmap(
    results_df: pd.DataFrame,
    metric: str,
    split: str = "test",
) -> go.Figure:
    """Create a heatmap showing model consistency across tasks and targets.

    Args:
        results_df: DataFrame with experiment results
        metric: Metric to analyze
        split: Data split to show

    Returns:
        Plotly figure showing heatmap of model performance

    Raises:
        ValueError: If the f"{split}_{metric}" column is missing.
    """
    metric_col = f"{split}_{metric}"
    if metric_col not in results_df.columns:
        raise ValueError(f"Metric {metric_col} not found in results")
    # Fuse task and target so every pair becomes one heatmap column.
    scores = results_df.copy()
    scores["task_target"] = scores["task"] + "_" + scores["target"]
    # Best score per (model, task_target) pair, pivoted into a model-by-pair grid.
    best_scores = scores.groupby(["model", "task_target"])[metric_col].max().reset_index()
    grid = best_scores.pivot_table(index="model", columns="task_target", values=metric_col)
    heatmap = go.Heatmap(
        z=grid.to_numpy(),
        x=grid.columns,
        y=grid.index,
        colorscale="Viridis",
        text=grid.to_numpy().round(3),
        texttemplate="%{text}",
        textfont={"size": 10},
        colorbar={"title": format_metric_name(metric)},
    )
    fig = go.Figure(data=heatmap)
    fig.update_layout(
        title=f"Model Consistency: {format_metric_name(metric)} Across Tasks and Targets ({split.capitalize()} Set)",
        xaxis_title="Task_Target",
        yaxis_title="Model",
        height=max(400, len(grid.index) * 50),
    )
    return fig
def create_feature_importance_comparison_plot(
    feature_importance_df: pd.DataFrame,
    top_n: int = 20,
) -> go.Figure:
    """Create a plot comparing feature importance across different models/configurations.

    Args:
        feature_importance_df: DataFrame with columns: feature, importance, model, grid, level, task
        top_n: Number of top features to show

    Returns:
        Plotly figure showing feature importance comparison
    """
    # Rank features by their mean importance across all configurations.
    mean_importance = feature_importance_df.groupby("feature")["importance"].mean().reset_index()
    top_features = mean_importance.nlargest(top_n, "importance")["feature"].tolist()
    # Keep only rows belonging to the top-ranked features.
    subset = feature_importance_df[feature_importance_df["feature"].isin(top_features)].copy()
    # One readable label per (model, grid, level, task) configuration.
    subset["config"] = [
        f"{model} ({grid}_{level}, {task})"
        for model, grid, level, task in zip(
            subset["model"].astype(str),
            subset["grid"],
            subset["level"].astype(str),
            subset["task"],
        )
    ]
    # Grouped bars: one group per feature, one bar per configuration.
    fig = px.bar(
        subset,
        x="feature",
        y="importance",
        color="config",
        barmode="group",
        title=f"Top {top_n} Features: Importance Comparison Across Configurations",
        labels={"importance": "Feature Importance", "feature": "Feature", "config": "Configuration"},
    )
    fig.update_layout(
        height=600,
        xaxis_tickangle=-45,
        showlegend=True,
        legend={"orientation": "v", "yanchor": "top", "y": 1, "xanchor": "left", "x": 1.02},
    )
    return fig
def create_feature_importance_heatmap(
    feature_importance_df: pd.DataFrame,
    top_n: int = 30,
) -> go.Figure:
    """Create a heatmap of feature importance across configurations.

    Args:
        feature_importance_df: DataFrame with columns: feature, importance, model, grid, level, task
        top_n: Number of top features to show

    Returns:
        Plotly figure showing heatmap
    """
    # Rank features by their mean importance across all configurations.
    mean_importance = feature_importance_df.groupby("feature")["importance"].mean().reset_index()
    top_features = mean_importance.nlargest(top_n, "importance")["feature"].tolist()
    # Keep only rows belonging to the top-ranked features.
    subset = feature_importance_df[feature_importance_df["feature"].isin(top_features)].copy()
    # One compact identifier per (grid, level, task, model) configuration.
    subset["config"] = [
        f"{grid}_{level}_{task}_{model}"
        for grid, level, task, model in zip(
            subset["grid"],
            subset["level"].astype(str),
            subset["task"],
            subset["model"].astype(str),
        )
    ]
    # Features on rows, configurations on columns; missing pairs become 0.
    importance_grid = subset.pivot_table(
        index="feature", columns="config", values="importance", aggfunc="mean", fill_value=0
    )
    heatmap = go.Heatmap(
        z=importance_grid.to_numpy(),
        x=importance_grid.columns,
        y=importance_grid.index,
        colorscale="Viridis",
        colorbar={"title": "Importance"},
    )
    fig = go.Figure(data=heatmap)
    fig.update_layout(
        title=f"Top {top_n} Features: Importance Heatmap Across Configurations",
        xaxis_title="Configuration",
        yaxis_title="Feature",
        height=max(500, len(importance_grid.index) * 20),
        xaxis_tickangle=-45,
    )
    return fig
def create_model_performance_distribution(
    results_df: pd.DataFrame,
    metric: str,
    split: str = "test",
) -> go.Figure:
    """Create violin plots showing performance distribution by model type.

    Args:
        results_df: DataFrame with experiment results
        metric: Metric to analyze
        split: Data split to show

    Returns:
        Plotly figure with violin plots

    Raises:
        ValueError: If the f"{split}_{metric}" column is missing.
    """
    metric_col = f"{split}_{metric}"
    if metric_col not in results_df.columns:
        raise ValueError(f"Metric {metric_col} not found in results")
    # One stable palette color per model type.
    palette = get_palette("models", results_df["model"].nunique())
    color_map = dict(zip(results_df["model"].unique(), palette))
    fig = px.violin(
        results_df,
        x="model",
        y=metric_col,
        color="model",
        box=True,
        points="all",
        title=f"{format_metric_name(metric)} Distribution by Model Type ({split.capitalize()} Set)",
        labels={metric_col: format_metric_name(metric), "model": "Model"},
        color_discrete_map=color_map,
    )
    fig.update_layout(
        height=500,
        showlegend=False,
    )
    return fig
def create_grid_vs_model_performance(
    results_df: pd.DataFrame,
    metric: str,
    split: str = "test",
) -> go.Figure:
    """Create a faceted plot showing model performance across grids.

    Args:
        results_df: DataFrame with experiment results
        metric: Metric to analyze
        split: Data split to show

    Returns:
        Plotly figure with box plots faceted by grid level (columns) and task (rows)

    Raises:
        ValueError: If the requested ``{split}_{metric}`` column is missing.
    """
    metric_col = f"{split}_{metric}"
    if metric_col not in results_df.columns:
        raise ValueError(f"Metric {metric_col} not found in results")
    # Work on a copy so the caller's frame is not mutated
    plot_df = results_df.copy()
    plot_df["grid_level"] = plot_df["grid"] + "_" + plot_df["level"].astype(str)
    fig = px.box(
        plot_df,
        x="model",
        y=metric_col,
        color="model",
        facet_col="grid_level",
        facet_row="task",
        points="all",
        title=f"{format_metric_name(metric)}: Model Performance Across Grid Levels and Tasks ({split.capitalize()})",
        labels={metric_col: format_metric_name(metric), "model": "Model"},
    )
    fig.update_layout(height=800, showlegend=True)
    return fig
def create_performance_improvement_plot(
    results_df: pd.DataFrame,
    metric: str,
    baseline_model: str = "rf",
    split: str = "test",
) -> go.Figure:
    """Create a plot showing performance improvement relative to a baseline model.

    Args:
        results_df: DataFrame with experiment results
        metric: Metric to analyze
        baseline_model: Model to use as baseline for comparison
        split: Data split to show

    Returns:
        Plotly figure showing improvement over baseline

    Raises:
        ValueError: If the requested ``{split}_{metric}`` column is missing.
    """
    metric_col = f"{split}_{metric}"
    if metric_col not in results_df.columns:
        raise ValueError(f"Metric {metric_col} not found in results")
    results_df = results_df.copy()
    # Best baseline score for each task-target-grid-level combination
    baseline_perf = (
        results_df[results_df["model"] == baseline_model]
        .groupby(["task", "target", "grid", "level"])[metric_col]
        .max()
        .reset_index()
        .rename(columns={metric_col: "baseline_score"})
    )
    # Left merge: configurations without a baseline run keep NaN and simply
    # drop out of the percentage plot instead of crashing.
    results_with_baseline = results_df.merge(baseline_perf, on=["task", "target", "grid", "level"], how="left")
    # Absolute improvement over the baseline score
    results_with_baseline["improvement"] = results_with_baseline[metric_col] - results_with_baseline["baseline_score"]
    # Guard against division by zero: a zero baseline would yield +/-inf and
    # wreck the y-axis scaling. Treat it as undefined (NaN) instead.
    baseline_scores = results_with_baseline["baseline_score"]
    safe_baseline = baseline_scores.mask(baseline_scores == 0)
    results_with_baseline["improvement_pct"] = 100 * results_with_baseline["improvement"] / safe_baseline
    # The baseline model is trivially at 0% improvement - exclude it
    results_with_baseline = results_with_baseline[results_with_baseline["model"] != baseline_model]
    fig = px.box(
        results_with_baseline,
        x="model",
        y="improvement_pct",
        color="model",
        points="all",
        title=(
            f"Performance Improvement Over {baseline_model.upper()} Baseline "
            f"({format_metric_name(metric)}, {split.capitalize()} Set)"
        ),
        labels={"improvement_pct": "Improvement (%)", "model": "Model"},
    )
    fig.add_hline(y=0, line_dash="dash", line_color="gray", annotation_text="Baseline")
    fig.update_layout(
        height=500,
        showlegend=False,
    )
    return fig
def create_top_models_bar_chart(
    task_df: pd.DataFrame,
    metric: str,
    task_name: str,
    split: str = "test",
    top_n: int = 10,
) -> go.Figure:
    """Create a horizontal bar chart showing top models for a task.

    Args:
        task_df: DataFrame filtered for a specific task
        metric: Metric to display
        task_name: Name of the task for the title
        split: Data split to show
        top_n: Number of top models to show

    Returns:
        Plotly figure with horizontal bar chart

    Raises:
        ValueError: If the requested ``{split}_{metric}`` column is missing.
    """
    metric_col = f"{split}_{metric}"
    if metric_col not in task_df.columns:
        raise ValueError(f"Metric {metric_col} not found in data")
    # Get top N models
    top_models = task_df.nlargest(top_n, metric_col).copy()
    # Create label combining model, grid level, and target
    top_models["label"] = (
        top_models["model"].str.upper()
        + " ("
        + top_models["grid"]
        + "-"
        + top_models["level"].astype(str)
        + ", "
        + top_models["target"]
        + ")"
    )
    # Sort by score for display (ascending so best is on top)
    top_models = top_models.sort_values(metric_col, ascending=True)
    # Build the palette once, not once per model as before (each get_palette
    # call rebuilt the whole palette inside the dict comprehension).
    unique_models = top_models["model"].unique()
    palette = get_palette("models", len(unique_models))
    model_colors = dict(zip(unique_models, palette))
    top_models["color"] = top_models["model"].map(model_colors)
    fig = go.Figure(
        data=[
            go.Bar(
                y=top_models["label"],
                x=top_models[metric_col],
                orientation="h",
                marker={"color": top_models["color"]},
                text=top_models[metric_col].round(4),
                textposition="auto",
                hovertemplate=("<b>%{y}</b><br>" + f"{format_metric_name(metric)}: %{{x:.4f}}<br>" + "<extra></extra>"),
            )
        ]
    )
    fig.update_layout(
        title=f"Top {top_n} Models - {task_name.replace('_', ' ').title()}",
        xaxis_title=format_metric_name(metric),
        yaxis_title="",
        height=max(400, top_n * 40),
        showlegend=False,
        margin={"l": 250},
    )
    return fig
def create_feature_importance_by_grid_level(
    fi_df: pd.DataFrame,
    top_n: int = 15,
) -> go.Figure:
    """Create a grouped bar chart showing top features by grid level.

    Args:
        fi_df: Feature importance DataFrame with columns: feature, importance, grid_level
        top_n: Number of top features to show

    Returns:
        Plotly figure with grouped bar chart
    """
    # Rank features by mean importance across all runs and keep the best
    top_features = fi_df.groupby("feature")["importance"].mean().nlargest(top_n).index.tolist()
    subset = fi_df[fi_df["feature"].isin(top_features)]
    # Mean importance per (grid level, feature) pair drives the bar heights
    means = subset.groupby(["grid_level", "feature"])["importance"].mean().reset_index()
    fig = px.bar(
        means,
        x="feature",
        y="importance",
        color="grid_level",
        barmode="group",
        title=f"Top {top_n} Features by Grid Level",
        labels={"importance": "Mean Importance", "feature": "Feature", "grid_level": "Grid Level"},
    )
    fig.update_layout(
        height=500,
        xaxis_tickangle=-45,
        showlegend=True,
        legend={"orientation": "v", "yanchor": "top", "y": 1, "xanchor": "left", "x": 1.02},
    )
    return fig
def create_feature_consistency_plot(
    fi_df: pd.DataFrame,
    top_n: int = 15,
) -> go.Figure:
    """Create a scatter plot showing feature importance vs consistency.

    Args:
        fi_df: Feature importance DataFrame
        top_n: Number of top features to show

    Returns:
        Plotly figure with scatter plot
    """
    # Keep only the top_n features by mean importance
    top_features = fi_df.groupby("feature")["importance"].mean().nlargest(top_n).index.tolist()
    subset = fi_df[fi_df["feature"].isin(top_features)]
    stats = subset.groupby("feature")["importance"].agg(["mean", "std"]).reset_index()
    # Coefficient of variation: spread relative to magnitude
    stats["cv"] = stats["std"] / stats["mean"]
    # Color by data source when that metadata column exists
    if "data_source" in fi_df.columns:
        source_lookup = fi_df[["feature", "data_source"]].drop_duplicates().set_index("feature")["data_source"]
        stats["data_source"] = stats["feature"].map(source_lookup)
        color_col = "data_source"
    else:
        color_col = None
    fig = px.scatter(
        stats,
        x="cv",
        y="mean",
        size="std",
        color=color_col,
        hover_data=["feature"],
        text="feature",
        title="Feature Importance vs Consistency (Coefficient of Variation)",
        labels={
            "cv": "Coefficient of Variation (lower = more consistent)",
            "mean": "Mean Importance",
            "std": "Std Dev",
        },
    )
    fig.update_traces(textposition="top center", textfont_size=8)
    fig.update_layout(height=600, showlegend=True)
    return fig
def create_data_source_importance_bars(
    fi_df: pd.DataFrame,
) -> go.Figure:
    """Create a bar chart showing importance breakdown by data source.

    Args:
        fi_df: Feature importance DataFrame with data_source column

    Returns:
        Plotly figure with bar chart

    Raises:
        ValueError: If ``fi_df`` has no ``data_source`` column.
    """
    if "data_source" not in fi_df.columns:
        raise ValueError("data_source column not found in feature importance data")
    # Aggregate by data source, highest total importance first
    source_stats = (
        fi_df.groupby("data_source")["importance"]
        .agg(["sum", "mean", "count"])
        .reset_index()
        .sort_values("sum", ascending=False)
    )
    # Create colors for data sources
    colors = get_palette("sources", len(source_stats))
    fig = go.Figure(
        data=[
            go.Bar(
                x=source_stats["data_source"],
                y=source_stats["sum"],
                marker={"color": colors},
                text=source_stats["sum"].round(2),
                textposition="auto",
                hovertemplate=(
                    "<b>%{x}</b><br>"
                    "Total Importance: %{y:.2f}<br>"
                    "Mean: %{customdata[0]:.4f}<br>"
                    "Features: %{customdata[1]}<br>"
                    "<extra></extra>"
                ),
                # .to_numpy() for consistency with the rest of the module
                # (pandas recommends it over the ambiguous .values attribute)
                customdata=source_stats[["mean", "count"]].to_numpy(),
            )
        ]
    )
    fig.update_layout(
        title="Feature Importance by Data Source",
        xaxis_title="Data Source",
        yaxis_title="Total Importance",
        height=400,
        showlegend=False,
    )
    return fig
def create_inference_maps(
    inference_gdf: "gpd.GeoDataFrame",
    grid: str,
    level: int,
    task: str,
) -> tuple["pdk.Deck", "pdk.Deck"]:
    """Create inference maps showing mean and std of predictions using pydeck.

    Args:
        inference_gdf: GeoDataFrame with geometry, mean_prediction, std_prediction columns
        grid: Grid type (e.g., 'hex', 'healpix')
        level: Grid resolution level
        task: Task type for title

    Returns:
        Tuple of (mean_deck, std_deck) pydeck visualizations
    """
    import matplotlib.colors as mcolors
    import numpy as np
    import pydeck as pdk
    from matplotlib.colors import LinearSegmentedColormap

    from entropice.dashboard.utils.colors import hex_to_rgb
    from entropice.dashboard.utils.geometry import fix_hex_geometry

    def _colorize(values: "np.ndarray", cmap: LinearSegmentedColormap) -> list:
        """Map values to RGB colors using robust (2-98 percentile) normalization."""
        vmin, vmax = np.nanpercentile(values, [2, 98])
        if vmax > vmin:
            normalized = np.clip((values - vmin) / (vmax - vmin), 0, 1)
        else:
            # Constant field: fall back to the low end of the colormap
            normalized = np.zeros_like(values)
        return [hex_to_rgb(mcolors.to_hex(cmap(val))) for val in normalized]

    def _to_geojson(frame, color_col: str, value_col: str) -> list[dict]:
        """Build GeoJSON features carrying a fill color and the raw value."""
        features = []
        for _, row in frame.iterrows():
            features.append(
                {
                    "type": "Feature",
                    "geometry": row["geometry"].__geo_interface__,
                    "properties": {
                        "fill_color": row[color_col],
                        value_col: float(row[value_col]),
                    },
                }
            )
        return features

    def _make_layer(geojson: list[dict]) -> "pdk.Layer":
        """Create a flat, pickable GeoJson layer with thin gray outlines."""
        return pdk.Layer(
            "GeoJsonLayer",
            geojson,
            opacity=0.85,
            stroked=True,
            filled=True,
            extruded=False,
            wireframe=False,
            get_fill_color="properties.fill_color",
            get_line_color=[100, 100, 100],
            line_width_min_pixels=0.3,
            pickable=True,
        )

    # pydeck expects WGS84 coordinates
    gdf = inference_gdf.copy().to_crs("EPSG:4326")
    # Fix antimeridian issues for hex cells
    gdf["geometry"] = gdf["geometry"].apply(fix_hex_geometry)
    # Predictions: white -> blue sequence; uncertainty: white -> green sequence
    colors_mean = ["#f7fbff", "#deebf7", "#c6dbef", "#9ecae1", "#6baed6", "#4292c6", "#2171b5", "#08519c", "#08306b"]
    cmap_mean = LinearSegmentedColormap.from_list("prediction", colors_mean)
    colors_std = ["#f7fcf5", "#e5f5e0", "#c7e9c0", "#a1d99b", "#74c476", "#41ab5d", "#238b45", "#006d2c", "#00441b"]
    cmap_std = LinearSegmentedColormap.from_list("uncertainty", colors_std)
    # Shared colorize/geojson/layer helpers replace the previous copy-pasted
    # mean/std code paths.
    gdf["mean_color"] = _colorize(gdf["mean_prediction"].to_numpy(), cmap_mean)
    gdf["std_color"] = _colorize(gdf["std_prediction"].to_numpy(), cmap_std)
    layer_mean = _make_layer(_to_geojson(gdf, "mean_color", "mean_prediction"))
    layer_std = _make_layer(_to_geojson(gdf, "std_color", "std_prediction"))
    # Default camera over the Arctic region
    view_state = pdk.ViewState(
        latitude=75,
        longitude=0,
        zoom=2.5,
        pitch=0,
    )
    deck_mean = pdk.Deck(
        layers=[layer_mean],
        initial_view_state=view_state,
        tooltip={
            "html": (
                f"<b>Mean Prediction:</b> {{mean_prediction}}<br>"
                f"<b>{grid.upper()}-{level}</b> | <b>Task:</b> {task.title()}"
            ),
            "style": {"backgroundColor": "#2171b5", "color": "white"},
        },
        map_style="https://basemaps.cartocdn.com/gl/positron-gl-style/style.json",
    )
    deck_std = pdk.Deck(
        layers=[layer_std],
        initial_view_state=view_state,
        tooltip={
            "html": (
                f"<b>Uncertainty (Std):</b> {{std_prediction}}<br>"
                f"<b>{grid.upper()}-{level}</b> | <b>Task:</b> {task.title()}"
            ),
            "style": {"backgroundColor": "#238b45", "color": "white"},
        },
        map_style="https://basemaps.cartocdn.com/gl/positron-gl-style/style.json",
    )
    return deck_mean, deck_std

View file

@ -6,10 +6,74 @@ import plotly.graph_objects as go
import pydeck as pdk import pydeck as pdk
import streamlit as st import streamlit as st
from entropice.dashboard.utils.class_ordering import get_ordered_classes, sort_class_series
from entropice.dashboard.utils.colors import get_palette from entropice.dashboard.utils.colors import get_palette
from entropice.dashboard.utils.geometry import fix_hex_geometry from entropice.dashboard.utils.geometry import fix_hex_geometry
from entropice.dashboard.utils.loaders import TrainingResult from entropice.dashboard.utils.loaders import TrainingResult
from entropice.utils.types import Task
# Canonical orderings imported from the ML pipeline
# Binary labels are defined inline in dataset.py: {False: "No RTS", True: "RTS"}
# Count/Density labels are defined in the bin_values function
# Ordinal class labels, listed from "absent/low" to "high".
BINARY_LABELS = ["No RTS", "RTS"]
COUNT_LABELS = ["None", "Very Few", "Few", "Several", "Many", "Very Many"]
DENSITY_LABELS = ["Empty", "Very Sparse", "Sparse", "Moderate", "Dense", "Very Dense"]
# Canonical label order per task; keeps dashboard plots consistent with training.
CLASS_ORDERINGS: dict[Task | str, list[str]] = {
    "binary": BINARY_LABELS,
    "count_regimes": COUNT_LABELS,
    "density_regimes": DENSITY_LABELS,
    # Legacy aliases (deprecated)
    "count": COUNT_LABELS,
    "density": DENSITY_LABELS,
}
def get_ordered_classes(task: Task | str, available_classes: list[str] | None = None) -> list[str]:
    """Get properly ordered class labels for a given task.

    This uses the same canonical ordering as defined in the ML dataset module,
    ensuring consistency between training and inference visualizations.

    Args:
        task: Task type ('binary', 'count_regimes', 'density_regimes', 'count', 'density').
        available_classes: Optional list of available classes to filter and order.
            If None, returns all canonical classes for the task.

    Returns:
        List of class labels in proper order. Always a fresh list.

    Raises:
        KeyError: If ``task`` has no entry in ``CLASS_ORDERINGS``.

    Examples:
        >>> get_ordered_classes("binary")
        ['No RTS', 'RTS']
        >>> get_ordered_classes("count_regimes", ["None", "Few", "Several"])
        ['None', 'Few', 'Several']
    """
    canonical_order = CLASS_ORDERINGS[task]
    if available_classes is None:
        # Return a copy so callers cannot mutate the module-level constant.
        return list(canonical_order)
    # Filter canonical order to only include available classes, preserving order
    return [cls for cls in canonical_order if cls in available_classes]
def sort_class_series(series: pd.Series, task: Task | str) -> pd.Series:
    """Sort a pandas Series with class labels according to canonical ordering.

    Args:
        series: Pandas Series with class labels as index.
        task: Task type ('binary', 'count_regimes', 'density_regimes', 'count', 'density').

    Returns:
        Sorted Series with classes in canonical order.
    """
    # Restrict the canonical order to the labels actually present, then reindex
    ordered_labels = get_ordered_classes(task, series.index.tolist())
    return series.reindex(ordered_labels)
def render_inference_statistics(predictions_gdf: gpd.GeoDataFrame, task: str): def render_inference_statistics(predictions_gdf: gpd.GeoDataFrame, task: str):
@ -60,7 +124,7 @@ def render_class_distribution_histogram(predictions_gdf: gpd.GeoDataFrame, task:
Args: Args:
predictions_gdf: GeoDataFrame with predictions. predictions_gdf: GeoDataFrame with predictions.
task: Task type ('binary', 'count', 'density'). task: Task type ('binary', 'count_regimes', 'density_regimes', 'count', 'density').
""" """
st.subheader("📊 Predicted Class Distribution") st.subheader("📊 Predicted Class Distribution")
@ -348,7 +412,7 @@ def render_class_comparison(predictions_gdf: gpd.GeoDataFrame, task: str):
Args: Args:
predictions_gdf: GeoDataFrame with predictions. predictions_gdf: GeoDataFrame with predictions.
task: Task type ('binary', 'count', 'density'). task: Task type ('binary', 'count_regimes', 'density_regimes', 'count', 'density').
""" """
st.subheader("🔍 Class Comparison") st.subheader("🔍 Class Comparison")

View file

@ -0,0 +1,674 @@
"""Cross-dataset correlation analysis dashboard section."""
from typing import Any, cast
import pandas as pd
import streamlit as st
import xarray as xr
from entropice.dashboard.plots.correlations import (
create_dendrogram_plot,
create_feature_variance_plot,
create_full_correlation_heatmap,
create_matplotlib_correlation_heatmap,
create_mutual_information_matrix,
create_pca_biplot,
select_top_variable_features,
)
from entropice.utils.types import L2SourceDataset
def _get_aggregation_dimensions(ds: xr.Dataset) -> list[str]:
    """Get aggregation dimension names from a dataset.

    Args:
        ds: Xarray Dataset

    Returns:
        List of aggregation dimension names (any of 'agg' / 'aggregations')
    """
    recognized = ("agg", "aggregations")
    return [str(name) for name in ds.dims if name in recognized]
def _set_all_aggregations_corr(
    member_datasets: dict[L2SourceDataset, xr.Dataset],
    members_with_aggs: list[L2SourceDataset],
    member_agg_dims: dict[L2SourceDataset, list[str]],
    selected: bool,
):
    """Set all aggregation checkboxes to selected or deselected state.

    Args:
        member_datasets: Dictionary mapping members to their loaded datasets
        members_with_aggs: List of members that have aggregations
        member_agg_dims: Dictionary mapping members to aggregation dimensions
        selected: True to select all, False to deselect all
    """
    for member in members_with_aggs:
        dataset = member_datasets.get(member)
        if dataset is None:
            continue
        for agg_dim in member_agg_dims.get(member, []):
            if agg_dim not in dataset.dims:
                continue
            # One session-state flag per (member, dimension, value) checkbox
            for val in dataset.coords[agg_dim].to_numpy().tolist():
                st.session_state[f"corr_agg_{member}_{agg_dim}_{val}"] = selected
def _set_median_only_aggregations_corr(
    member_datasets: dict[L2SourceDataset, xr.Dataset],
    members_with_aggs: list[L2SourceDataset],
    member_agg_dims: dict[L2SourceDataset, list[str]],
):
    """Set only median aggregations to selected, deselect all others.

    Args:
        member_datasets: Dictionary mapping members to their loaded datasets
        members_with_aggs: List of members that have aggregations
        member_agg_dims: Dictionary mapping members to aggregation dimensions
    """
    for member in members_with_aggs:
        ds = member_datasets.get(member)
        if ds is None:
            continue
        agg_dims = member_agg_dims.get(member, [])
        for agg_dim in agg_dims:
            if agg_dim in ds.dims:
                agg_values = ds.coords[agg_dim].to_numpy().tolist()
                for val in agg_values:
                    # Select only aggregations whose name mentions 'median'.
                    # The substring test already covers the exact match, so the
                    # previous `== "median" or "median" in ...` was redundant.
                    is_median = "median" in str(val).lower()
                    st.session_state[f"corr_agg_{member}_{agg_dim}_{val}"] = is_median
def _render_member_aggregation_checkboxes(
    member: L2SourceDataset,
    ds: xr.Dataset,
    agg_dims: list[str],
    column: Any,
    dimension_filters: dict[str, dict[str, list[str]]],
) -> dict[str, dict[str, list[str]]]:
    """Render checkboxes for a single member's aggregations.

    Args:
        member: Data source whose aggregations are being rendered.
        ds: The member's dataset; checkbox values come from its coordinates.
        agg_dims: Aggregation dimension names to render for this member.
        column: Streamlit column container to render into.
        dimension_filters: Accumulator mapping member -> dim -> selected values;
            mutated in place and also returned.

    Returns:
        The (possibly updated) ``dimension_filters`` mapping. A filter entry is
        only recorded when the user deselected at least one value — "all
        selected" means no filtering is needed for that dimension.
    """
    with column:
        st.markdown(f"**{member}:**")
        for agg_dim in agg_dims:
            if agg_dim not in ds.dims:
                continue
            # Get coordinate values
            agg_values = [str(v) for v in ds.coords[agg_dim].to_numpy().tolist()]
            st.markdown(f"*{agg_dim}:*")
            selected_vals = []
            for val in agg_values:
                # Key must match the ones set by the select-all/median helpers
                key = f"corr_agg_{member}_{agg_dim}_{val}"
                if st.checkbox(val, value=True, key=key, help=f"Include {val} from {agg_dim}"):
                    selected_vals.append(val)
            # Store selected values if not all selected
            if selected_vals and len(selected_vals) < len(agg_values):
                if member not in dimension_filters:
                    dimension_filters[member] = {}
                dimension_filters[member][agg_dim] = selected_vals
    return dimension_filters
def _render_aggregation_selection(
    member_datasets: dict[L2SourceDataset, xr.Dataset],
) -> dict[str, dict[str, list[str]]]:
    """Render aggregation selection controls for members that have aggregations.

    Note:
        Calls ``st.stop()`` until the form is submitted, halting the rest of
        the page render for this run.

    Args:
        member_datasets: Dictionary mapping member names to their xarray Datasets

    Returns:
        Dictionary mapping member names to dimension filters. Empty when no
        member exposes an aggregation dimension.
    """
    # Find members with aggregations
    members_with_aggs = []
    member_agg_dims = {}
    for member, ds in member_datasets.items():
        agg_dims = _get_aggregation_dimensions(ds)
        if agg_dims:
            members_with_aggs.append(member)
            member_agg_dims[member] = agg_dims
    if not members_with_aggs:
        return {}
    st.markdown("#### Select Aggregations")
    st.markdown(
        "Select which spatial aggregations to include. "
        "This allows you to analyze correlations for specific aggregation types (e.g., mean, median, std)."
    )
    # Add buttons for common selections (outside form to allow state manipulation)
    col_btn1, col_btn2, col_btn3, _ = st.columns([1, 1, 1, 3])
    with col_btn1:
        if st.button("✅ Select All", use_container_width=True, key="corr_agg_select_all"):
            _set_all_aggregations_corr(member_datasets, members_with_aggs, member_agg_dims, selected=True)
    with col_btn2:
        if st.button("📊 Median Only", use_container_width=True, key="corr_agg_median_only"):
            _set_median_only_aggregations_corr(member_datasets, members_with_aggs, member_agg_dims)
    with col_btn3:
        if st.button("❌ Deselect All", use_container_width=True, key="corr_agg_deselect_all"):
            _set_all_aggregations_corr(member_datasets, members_with_aggs, member_agg_dims, selected=False)
    # Render form with checkboxes
    with st.form("correlation_aggregation_selection_form"):
        dimension_filters: dict[str, dict[str, list[str]]] = {}
        # Create columns for each member
        member_cols = st.columns(len(members_with_aggs))
        for col_idx, member in enumerate(members_with_aggs):
            dimension_filters = _render_member_aggregation_checkboxes(
                member,
                member_datasets[member],
                member_agg_dims[member],
                member_cols[col_idx],
                dimension_filters,
            )
        # Submit button for the form
        submitted = st.form_submit_button("Apply Aggregation Filters", type="primary", use_container_width=True)
    if not submitted:
        st.info("👆 Click 'Apply Aggregation Filters' to update the configuration")
        st.stop()
    return dimension_filters
def _flatten_dataset_to_series(
    ds: xr.Dataset,
    prefix: str = "",
    dimension_filter: dict[str, list[str]] | None = None,
) -> dict[str, pd.Series]:
    """Flatten an xarray Dataset into a dict of pandas Series.

    Handles multi-dimensional variables by creating separate series for each combination
    of non-cell_ids dimensions.

    Args:
        ds: Xarray Dataset to flatten
        prefix: Prefix to add to variable names
        dimension_filter: Optional dict mapping dimension names to lists of allowed values.
            Only combinations matching these values will be included.
            Filter values are compared as strings against ``str(coord_value)``.

    Returns:
        Dictionary mapping variable names to pandas Series with cell_ids as index
    """
    series_dict = {}
    dimension_filter = dimension_filter or {}
    for var_name in ds.data_vars:
        var_data = ds[var_name]
        # Get dimensions other than cell_ids
        other_dims = [dim for dim in var_data.dims if dim != "cell_ids"]
        if len(other_dims) == 0:
            # Simple 1D variable
            series = var_data.to_series()
            full_name = f"{prefix}{var_name}" if prefix else var_name
            series_dict[full_name] = series
        else:
            # Multi-dimensional variable - create series for each combination.
            # Stacking collapses all non-cell dims into one MultiIndex-like axis
            # whose values enumerate every coordinate combination.
            for coord_values in var_data.stack(stacked=other_dims).coords["stacked"].to_numpy():  # noqa: PD013
                # Create selector dict (single extra dim yields a scalar, not a tuple)
                if isinstance(coord_values, tuple):
                    selector = dict(zip(other_dims, coord_values))
                else:
                    selector = {other_dims[0]: coord_values}
                # Check if this combination passes the dimension filter
                if dimension_filter:
                    skip = False
                    for dim, val in selector.items():
                        if dim in dimension_filter:
                            # Check if value is in allowed list (convert to string for comparison)
                            if str(val) not in dimension_filter[dim]:
                                skip = True
                                break
                    if skip:
                        continue
                # Extract 1D series
                series = var_data.sel(selector).to_series()
                # Create descriptive name, e.g. "<prefix><var>_<coord1>_<coord2>"
                suffix = "_".join(str(v) for v in (coord_values if isinstance(coord_values, tuple) else [coord_values]))
                full_name = f"{prefix}{var_name}_{suffix}" if prefix else f"{var_name}_{suffix}"
                series_dict[full_name] = series
    return series_dict
@st.fragment
def _render_correlation_matrix(data_dict: dict[str, pd.Series]):
    """Render correlation matrix visualization.

    Runs as a Streamlit fragment so widget interactions re-render only this
    section. Offers optional feature reduction for large inputs and switches
    from Plotly to Matplotlib rendering above 200 features.

    Args:
        data_dict: Dictionary mapping variable names to pandas Series
    """
    st.subheader("Correlation Matrix")
    n_features = len(data_dict)
    # Add feature reduction options for large datasets
    reduced_data = data_dict
    if n_features > 500:
        st.warning(
            f"⚠️ **Large dataset detected:** {n_features} features may be too many to visualize effectively. "
            "Consider reducing the feature set using the options below."
        )
        with st.expander("🎯 Feature Reduction Options", expanded=True):
            reduction_method = st.radio(
                "Reduction Strategy",
                options=["top_variable", "none"],
                format_func=lambda x: (
                    "Select Most Variable Features" if x == "top_variable" else "Use All Features (may be slow)"
                ),
                index=0,
                key="corr_reduction_method",
            )
            if reduction_method == "top_variable":
                col1, col2 = st.columns(2)
                with col1:
                    n_top = st.slider(
                        "Number of Features to Keep",
                        min_value=50,
                        max_value=min(1000, n_features),
                        value=min(500, n_features),
                        step=50,
                        key="corr_n_top",
                    )
                with col2:
                    variability_metric = st.selectbox(
                        "Variability Metric",
                        options=["variance", "iqr", "cv"],
                        format_func=lambda x: {
                            "variance": "Variance",
                            "iqr": "Interquartile Range",
                            "cv": "Coefficient of Variation",
                        }[x],
                        key="corr_variability_metric",
                    )
                reduced_data = select_top_variable_features(data_dict, n_features=n_top, method=variability_metric)
                st.success(f"✅ Reduced from {n_features} to {len(reduced_data)} most variable features")
    n_reduced = len(reduced_data)
    # Configuration options
    col1, col2 = st.columns([3, 1])
    with col1:
        method = st.selectbox(
            "Correlation Method",
            options=["pearson", "spearman", "kendall"],
            index=0,
            format_func=lambda x: x.title(),
            help="Pearson: linear relationships | Spearman: monotonic relationships | Kendall: robust to outliers",
            key="corr_method",
        )
    with col2:
        # Clustering is on by default only for small matrices
        cluster = cast(
            bool,
            st.toggle(
                "Cluster Variables",
                value=n_reduced < 100,
                help="Reorder by hierarchical clustering",
                key="corr_cluster",
            ),
        )
    # Check if subsampling will occur (the heatmap helpers subsample internally)
    df_test = pd.DataFrame(reduced_data).dropna()
    if len(df_test) > 50000:
        st.info(
            f"📊 **Dataset subsampled:** Using 50,000 randomly selected cells out of {len(df_test):,} "
            "for performance. Correlations remain representative."
        )
    # Use matplotlib for large feature sets (> 200), plotly for smaller
    use_matplotlib = n_reduced > 200
    if use_matplotlib:
        st.info(
            f"📈 **Using Matplotlib rendering** for {n_reduced} features "
            "(Plotly would exceed Streamlit's message size limit). "
            "This produces a static image but handles large datasets efficiently."
        )
        fig = create_matplotlib_correlation_heatmap(reduced_data, method=method, cluster=cluster)
        st.pyplot(fig, use_container_width=True)
        # Close figure to free memory
        import matplotlib.pyplot as plt_cleanup

        plt_cleanup.close(fig)
    else:
        fig = create_full_correlation_heatmap(reduced_data, method=method, cluster=cluster)
        st.plotly_chart(fig, width="stretch")
    st.markdown(
        f"""
        **{method.title()} correlation** measures the {"linear" if method == "pearson" else "monotonic"}
        relationship between variables. Values range from -1 (perfect negative correlation) to
        +1 (perfect positive correlation), with 0 indicating no correlation.
        """
    )
@st.fragment
def _render_pca_analysis(data_dict: dict[str, pd.Series]):
    """Render PCA biplot visualization.

    Runs as a Streamlit fragment; caps the feature count at 500 before
    computing the biplot.

    Args:
        data_dict: Dictionary mapping variable names to pandas Series
    """
    st.subheader("Principal Component Analysis (PCA)")
    n_features = len(data_dict)
    # Reduce features if too many for visualization
    reduced_data = data_dict
    if n_features > 500:
        st.warning(
            f"⚠️ **Large dataset:** {n_features} features detected. "
            "Automatically selecting the 500 most variable features for PCA visualization."
        )
        reduced_data = select_top_variable_features(data_dict, n_features=500, method="variance")
    show_loadings = cast(
        bool,
        st.toggle(
            "Show Loading Vectors",
            value=True,
            help="Display how each variable contributes to the principal components",
            key="pca_loadings",
        ),
    )
    fig = create_pca_biplot(reduced_data, n_components=2, show_loadings=show_loadings)
    st.plotly_chart(fig, width="stretch")
    st.markdown(
        """
        **PCA** reduces the dimensionality of the data while preserving variance.
        - **Blue points**: Individual grid cells projected onto the first two principal components
        - **Red arrows**: Loading vectors showing how each variable contributes to the PCs
        - Variables pointing in similar directions are positively correlated
        - Longer arrows indicate variables with higher variance
        """
    )
@st.fragment
def _render_hierarchical_clustering(data_dict: dict[str, pd.Series]):
    """Render hierarchical clustering dendrogram.

    Runs as a Streamlit fragment; caps the feature count at 300 before
    building the dendrogram.

    Args:
        data_dict: Dictionary mapping variable names to pandas Series
    """
    st.subheader("Hierarchical Clustering of Variables")
    n_features = len(data_dict)
    # Reduce features if too many
    reduced_data = data_dict
    if n_features > 300:
        st.warning(
            f"⚠️ **Large dataset:** {n_features} features detected. "
            "Automatically selecting the 300 most variable features for dendrogram visualization."
        )
        reduced_data = select_top_variable_features(data_dict, n_features=300, method="variance")
    # Correlation flavor used to derive the pairwise distance matrix
    method = st.selectbox(
        "Distance Metric (based on correlation)",
        options=["pearson", "spearman"],
        index=0,
        format_func=lambda x: x.title(),
        key="dendro_method",
    )
    fig = create_dendrogram_plot(reduced_data, method=method)
    st.plotly_chart(fig, width="stretch")
    st.markdown(
        """
        **Dendrogram** shows hierarchical relationships between variables based on correlation distance.
        - Variables that merge at lower heights are more similar (highly correlated)
        - Distinct clusters suggest groups of related features
        - Useful for identifying redundant features or feature groups
        """
    )
@st.fragment
def _render_mutual_information(data_dict: dict[str, pd.Series]):
    """Render mutual information matrix.

    Runs as a Streamlit fragment. Unlike the other analyses, this one refuses
    to run above 200 features instead of auto-reducing, because MI cost grows
    quickly with feature count.

    Args:
        data_dict: Dictionary mapping variable names to pandas Series
    """
    st.subheader("Mutual Information Analysis")
    n_features = len(data_dict)
    # Mutual information is very computationally expensive - hard limit
    if n_features > 200:
        st.error(
            f"❌ **Too many features:** Mutual information analysis with {n_features} features "
            "would be too computationally expensive. Please reduce to ≤200 features using the "
            "variable selection or aggregation filters."
        )
        # Bail out without rendering anything further
        return
    st.warning(
        "⚠️ **Computationally intensive**: This analysis may take some time. "
        "Dataset is subsampled to 10,000 cells for performance."
    )
    fig = create_mutual_information_matrix(data_dict)
    st.plotly_chart(fig, width="stretch")
    st.markdown(
        """
        **Mutual Information** captures both linear and non-linear relationships between variables.
        - Unlike correlation, MI can detect complex dependencies
        - Higher values indicate stronger information sharing
        - Particularly useful for identifying non-linear relationships that correlation might miss
        """
    )
@st.fragment
def _render_variance_analysis(data_dict: dict[str, pd.Series]):
    """Render feature variance and spread analysis.

    Runs as a Streamlit fragment; delegates all computation to
    ``create_feature_variance_plot``.

    Args:
        data_dict: Dictionary mapping variable names to pandas Series
    """
    st.subheader("Feature Variance and Spread")
    fig = create_feature_variance_plot(data_dict)
    st.plotly_chart(fig, width="stretch")
    st.markdown(
        """
        **Variance metrics** show the spread and variability of each feature:
        - **Standard Deviation**: Average distance from the mean
        - **IQR** (Interquartile Range): Spread of the middle 50% of data (robust to outliers)
        - **Range**: Difference between max and min values
        - **CV** (Coefficient of Variation): Normalized variability (std/mean)
        Features with very low variance may provide little information for modeling.
        """
    )
def _get_variable_groups(all_series: dict[str, pd.Series]) -> dict[str, list[str]]:
"""Group variables by data source.
Args:
all_series: Dictionary of all variable series
Returns:
Dictionary mapping source names to lists of variable names
"""
var_groups = {}
for var_name in all_series.keys():
source = var_name.split("_")[0]
if source not in var_groups:
var_groups[source] = []
var_groups[source].append(var_name)
return var_groups
def _render_variable_selection(all_series: dict[str, pd.Series]) -> dict[str, pd.Series]:
    """Render variable selection UI and return filtered data.

    Shows one checkbox per data source (grouped by name prefix); unchecking a
    source excludes all of its variables and shows a short sample of the
    excluded names instead. Halts the script run if fewer than two variables
    remain selected.

    Args:
        all_series: Dictionary of all variable series

    Returns:
        Filtered dictionary with only selected variables
    """
    var_groups = _get_variable_groups(all_series)
    selected_vars = []
    # Create checkboxes for each group
    for source, vars_in_group in sorted(var_groups.items()):
        with st.container():
            col1, col2 = st.columns([1, 4])
            with col1:
                # Widget key is derived from the source name so state survives reruns.
                select_all = st.checkbox(f"**{source}** ({len(vars_in_group)})", value=True, key=f"select_{source}")
            with col2:
                if select_all:
                    selected_vars.extend(vars_in_group)
                else:
                    # Show first few variables as examples
                    st.caption(", ".join(vars_in_group[:3]) + ("..." if len(vars_in_group) > 3 else ""))
    # Filter data dict
    filtered_data = {k: v for k, v in all_series.items() if k in selected_vars}
    if len(filtered_data) < 2:
        st.warning("⚠️ Please select at least 2 variables for correlation analysis")
        # st.stop() aborts the current script run; nothing below this executes.
        st.stop()
    st.info(f"**Selected {len(filtered_data)} variables** for analysis")
    return filtered_data
@st.fragment
def render_correlations_tab(
    member_datasets: dict[L2SourceDataset, xr.Dataset],
    grid_area_series: pd.Series | None = None,
):
    """Render the cross-dataset correlation analysis tab.

    Pipeline: (1) materialize member datasets, (2) let the user pick
    dimension/aggregation filters, (3) flatten everything into per-variable
    pandas Series, (4) let the user select variables, (5) render the five
    analysis sub-tabs.

    Args:
        member_datasets: Dictionary mapping member names to their xarray Datasets
        grid_area_series: Optional pandas Series with cell_ids as index and area values
    """
    st.markdown(
        """
        This section provides comprehensive analysis of relationships and similarities
        between all variables across different data sources (ArcticDEM, ERA5, AlphaEarth, etc.).
        """
    )
    # Flatten all datasets into a single dict of pandas Series
    with st.spinner("Preparing data for correlation analysis..."):
        # First, compute all datasets if needed
        computed_datasets = {}
        for member_name, ds in member_datasets.items():
            # Compute if lazy
            # NOTE(review): `isinstance(v.data, type(ds))` tests each variable's backing
            # array against the *Dataset* type, which looks like it can never be true —
            # presumably this was meant to detect lazy (dask) arrays. Confirm intent;
            # as written the compute() branch may be dead code.
            if any(isinstance(v.data, type(ds)) for v in ds.data_vars.values()):
                with st.spinner(f"Loading {member_name} data..."):
                    ds = ds.compute()
            computed_datasets[member_name] = ds
    # Render aggregation selection
    with st.expander("🔧 Aggregation Selection", expanded=False):
        dimension_filters = _render_aggregation_selection(computed_datasets)
        if dimension_filters:
            st.info(
                f"**Filtering applied:** {sum(len(dims) for dims in dimension_filters.values())} "
                f"dimension filter(s) across {len(dimension_filters)} data source(s)"
            )
    # Now flatten with filters applied
    with st.spinner("Flattening datasets..."):
        all_series = {}
        # Add grid area if provided
        if grid_area_series is not None:
            all_series["Grid_Cell_Area_km2"] = grid_area_series
        # Process each member dataset with its dimension filter
        for member_name, ds in computed_datasets.items():
            prefix = f"{member_name}_"
            member_filter = dimension_filters.get(member_name, None)
            member_series = _flatten_dataset_to_series(ds, prefix=prefix, dimension_filter=member_filter)
            all_series.update(member_series)
    st.success(f"✅ Loaded {len(all_series)} variables from {len(member_datasets)} data sources")
    # Variable selection
    with st.expander("🔧 Variable Selection", expanded=False):
        st.markdown("Select which variables to include in the correlation analysis.")
        filtered_data = _render_variable_selection(all_series)
    # Create tabs for different analyses
    analysis_tabs = st.tabs(
        [
            "📊 Correlation Matrix",
            "🔬 PCA Biplot",
            "🌳 Hierarchical Clustering",
            "📈 Variance Analysis",
            "🔗 Mutual Information",
        ]
    )
    with analysis_tabs[0]:
        _render_correlation_matrix(filtered_data)
    with analysis_tabs[1]:
        _render_pca_analysis(filtered_data)
    with analysis_tabs[2]:
        _render_hierarchical_clustering(filtered_data)
    with analysis_tabs[3]:
        _render_variance_analysis(filtered_data)
    with analysis_tabs[4]:
        _render_mutual_information(filtered_data)

View file

@ -0,0 +1,331 @@
"""Feature importance analysis section for experiment comparison."""
import pandas as pd
import streamlit as st
from entropice.dashboard.plots.experiment_comparison import (
create_data_source_importance_bars,
create_feature_consistency_plot,
create_feature_importance_by_grid_level,
)
from entropice.dashboard.utils.loaders import (
AutogluonTrainingResult,
TrainingResult,
)
def _extract_feature_importance_from_results(
training_results: list[TrainingResult],
) -> pd.DataFrame:
"""Extract feature importance from all training results.
Args:
training_results: List of TrainingResult objects
Returns:
DataFrame with columns: feature, importance, model, grid, level, task, target
"""
records = []
for tr in training_results:
# Load model state if available
model_state = tr.load_model_state()
if model_state is None:
continue
info = tr.display_info
# Extract feature importance based on available data
if "feature_importance" in model_state.data_vars:
# eSPA or similar models with direct feature importance
importance_data = model_state["feature_importance"]
for feature_idx, feature_name in enumerate(importance_data.coords["feature"].values):
importance_value = float(importance_data.isel(feature=feature_idx).values)
records.append(
{
"feature": str(feature_name),
"importance": importance_value,
"model": info.model,
"grid": info.grid,
"level": info.level,
"task": info.task,
"target": info.target,
}
)
elif "gain" in model_state.data_vars:
# XGBoost-style feature importance
gain_data = model_state["gain"]
for feature_idx, feature_name in enumerate(gain_data.coords["feature"].values):
importance_value = float(gain_data.isel(feature=feature_idx).values)
records.append(
{
"feature": str(feature_name),
"importance": importance_value,
"model": info.model,
"grid": info.grid,
"level": info.level,
"task": info.task,
"target": info.target,
}
)
elif "feature_importances_" in model_state.data_vars:
# Random Forest style
importance_data = model_state["feature_importances_"]
for feature_idx, feature_name in enumerate(importance_data.coords["feature"].values):
importance_value = float(importance_data.isel(feature=feature_idx).values)
records.append(
{
"feature": str(feature_name),
"importance": importance_value,
"model": info.model,
"grid": info.grid,
"level": info.level,
"task": info.task,
"target": info.target,
}
)
return pd.DataFrame(records)
def _extract_feature_importance_from_autogluon(
autogluon_results: list[AutogluonTrainingResult],
) -> pd.DataFrame:
"""Extract feature importance from AutoGluon results.
Args:
autogluon_results: List of AutogluonTrainingResult objects
Returns:
DataFrame with columns: feature, importance, model, grid, level, task, target
"""
records = []
for ag in autogluon_results:
if ag.feature_importance is None:
continue
info = ag.display_info
# AutoGluon feature importance is already a DataFrame with features as index
for feature_name, importance_value in ag.feature_importance["importance"].items():
records.append(
{
"feature": str(feature_name),
"importance": float(importance_value),
"model": "autogluon",
"grid": info.grid,
"level": info.level,
"task": info.task,
"target": info.target,
}
)
return pd.DataFrame(records)
def _categorize_feature(feature_name: str) -> str:
"""Categorize feature by data source."""
feature_lower = feature_name.lower()
if feature_lower.startswith("arcticdem"):
return "ArcticDEM"
if feature_lower.startswith("era5"):
return "ERA5"
if feature_lower.startswith("embeddings") or feature_lower.startswith("alphaearth"):
return "Embeddings"
return "General"
def _prepare_feature_importance_data(
    training_results: list[TrainingResult],
    autogluon_results: list[AutogluonTrainingResult],
) -> pd.DataFrame | None:
    """Extract and prepare feature importance data.

    Combines RandomSearchCV and AutoGluon importances into one frame and
    annotates each row with its data source and a combined grid_level key.

    Args:
        training_results: List of RandomSearchCV training results
        autogluon_results: List of AutoGluon training results

    Returns:
        DataFrame with feature importance data or None if no data available
    """
    cv_importance = _extract_feature_importance_from_results(training_results)
    ag_importance = _extract_feature_importance_from_autogluon(autogluon_results)
    if cv_importance.empty and ag_importance.empty:
        return None
    # Stack both sources into a single frame.
    combined = pd.concat([cv_importance, ag_importance], ignore_index=True)
    # Derived columns used by the downstream plots and filters.
    combined["data_source"] = combined["feature"].apply(_categorize_feature)
    combined["grid_level"] = combined["grid"] + "_" + combined["level"].astype(str)
    return combined
@st.fragment
def render_feature_importance_analysis(
    training_results: list[TrainingResult],
    autogluon_results: list[AutogluonTrainingResult],
):
    """Render feature importance analysis section.

    Three sub-sections: top features per grid level, importance consistency
    across models (coefficient of variation), and aggregate importance per
    data source. All plots degrade to an error message rather than crashing
    the page.

    Args:
        training_results: List of RandomSearchCV training results
        autogluon_results: List of AutoGluon training results
    """
    st.header("🔍 Feature Importance Analysis")
    st.markdown(
        """
        This section analyzes which features are most important across different
        models, grid levels, tasks, and targets.
        """
    )
    # Extract feature importance
    with st.spinner("Extracting feature importance from training results..."):
        fi_df = _prepare_feature_importance_data(training_results, autogluon_results)
    if fi_df is None:
        st.warning("No feature importance data available. Model state files may be missing.")
        return
    st.success(f"Extracted feature importance from {len(fi_df)} feature-model combinations")
    # Filters
    st.subheader("Filters")
    col1, col2, col3 = st.columns(3)
    with col1:
        # Task filter
        available_tasks = ["All", *sorted(fi_df["task"].unique().tolist())]
        selected_task = st.selectbox("Task", options=available_tasks, index=0, key="fi_task_filter")
    with col2:
        # Target filter
        available_targets = ["All", *sorted(fi_df["target"].unique().tolist())]
        selected_target = st.selectbox("Target Dataset", options=available_targets, index=0, key="fi_target_filter")
    with col3:
        # Top N features
        top_n_features = st.number_input("Top N Features", min_value=5, max_value=50, value=15, key="top_n_features")
    # Apply filters
    filtered_fi_df = fi_df.copy()
    if selected_task != "All":
        filtered_fi_df = filtered_fi_df.loc[filtered_fi_df["task"] == selected_task]
    if selected_target != "All":
        filtered_fi_df = filtered_fi_df.loc[filtered_fi_df["target"] == selected_target]
    if len(filtered_fi_df) == 0:
        st.warning("No feature importance data available for the selected filters.")
        return
    # Section 1: Top features by grid level
    st.subheader("Top Features by Grid Level")
    try:
        fig = create_feature_importance_by_grid_level(filtered_fi_df, top_n=top_n_features)
        st.plotly_chart(fig, width="stretch")
    except Exception as e:
        # Broad catch by design: one failed chart must not take down the page.
        st.error(f"Could not create feature importance by grid level plot: {e}")
    # Show detailed breakdown in expander
    grid_levels = sorted(filtered_fi_df["grid_level"].unique())
    with st.expander("Show Detailed Breakdown by Grid Level", expanded=False):
        for grid_level in grid_levels:
            grid_data = filtered_fi_df[filtered_fi_df["grid_level"] == grid_level]
            # Get top features for this grid level (mean importance across models)
            top_features_grid = (
                grid_data.groupby("feature")["importance"].mean().reset_index().nlargest(top_n_features, "importance")
            )
            st.markdown(f"**{grid_level.replace('_', '-').title()}**")
            # Create display dataframe with data source
            display_df = top_features_grid.merge(
                grid_data[["feature", "data_source"]].drop_duplicates(), on="feature", how="left"
            )
            display_df.columns = ["Feature", "Mean Importance", "Data Source"]
            display_df = display_df.sort_values("Mean Importance", ascending=False)
            st.dataframe(display_df, width="stretch", hide_index=True)
    # Section 2: Feature importance consistency across models
    st.subheader("Feature Importance Consistency Across Models")
    st.markdown(
        """
        **Coefficient of Variation (CV)**: Lower values indicate more consistent importance across models.
        High CV suggests the feature's importance varies significantly between different models.
        """
    )
    try:
        fig = create_feature_consistency_plot(filtered_fi_df, top_n=top_n_features)
        st.plotly_chart(fig, width="stretch")
    except Exception as e:
        st.error(f"Could not create feature consistency plot: {e}")
    # Show detailed statistics in expander
    with st.expander("Show Detailed Statistics", expanded=False):
        # Get top features overall
        overall_top_features = (
            filtered_fi_df.groupby("feature")["importance"]
            .mean()
            .reset_index()
            .nlargest(top_n_features, "importance")["feature"]
            .tolist()
        )
        # Calculate variance in importance across models for each feature
        feature_variance = (
            filtered_fi_df[filtered_fi_df["feature"].isin(overall_top_features)]
            .groupby("feature")["importance"]
            .agg(["mean", "std", "min", "max"])
            .reset_index()
        )
        feature_variance["coefficient_of_variation"] = feature_variance["std"] / feature_variance["mean"]
        feature_variance = feature_variance.sort_values("mean", ascending=False)
        # Add data source
        feature_variance = feature_variance.merge(
            filtered_fi_df[["feature", "data_source"]].drop_duplicates(), on="feature", how="left"
        )
        feature_variance.columns = ["Feature", "Mean", "Std Dev", "Min", "Max", "CV", "Data Source"]
        st.dataframe(
            feature_variance[["Feature", "Data Source", "Mean", "Std Dev", "CV"]],
            width="stretch",
            hide_index=True,
        )
    # Section 3: Feature importance by data source
    st.subheader("Feature Importance by Data Source")
    try:
        fig = create_data_source_importance_bars(filtered_fi_df)
        st.plotly_chart(fig, width="stretch")
    except Exception as e:
        st.error(f"Could not create data source importance chart: {e}")
    # Show detailed table in expander
    with st.expander("Show Data Source Statistics", expanded=False):
        # Aggregate importance by data source
        source_importance = (
            filtered_fi_df.groupby("data_source")["importance"].agg(["sum", "mean", "count"]).reset_index()
        )
        source_importance.columns = ["Data Source", "Total Importance", "Mean Importance", "Feature Count"]
        source_importance = source_importance.sort_values("Total Importance", ascending=False)
        st.dataframe(source_importance, width="stretch", hide_index=True)

View file

@ -0,0 +1,75 @@
"""Grid-level analysis section for experiment comparison."""
import pandas as pd
import streamlit as st
from entropice.dashboard.plots.experiment_comparison import create_grid_level_comparison_plot
from entropice.dashboard.utils.formatters import format_metric_name
@st.fragment
def render_grid_level_analysis(summary_df: pd.DataFrame, available_metrics: list[str]):
    """Render grid-level analysis section.

    For every task present in the summary, picks a task-appropriate metric
    and plots performance across grid types and resolution levels.

    Args:
        summary_df: Summary DataFrame with all results
        available_metrics: List of available metrics
    """
    st.header("📐 Grid Level Analysis")
    st.markdown(
        """
        This section analyzes how different grid levels affect model performance.
        Compare performance across grid types (hex vs healpix) and resolution levels.
        Metrics are automatically selected based on the task type.
        """
    )
    # Determine metrics to use per task:
    # classification tasks are scored with F1 variants, regression tasks with R².
    task_metric_map = {
        "binary": "f1",
        "count_regimes": "f1_weighted",
        "density_regimes": "f1_weighted",
        "count": "r2",
        "density": "r2",
    }
    # Get unique tasks in the data
    unique_tasks = summary_df["task"].unique()
    # Split selection
    split = st.selectbox("Data Split", options=["test", "train", "combined"], index=0, key="grid_split")
    # Create plots for each task
    for task in sorted(unique_tasks):
        # Determine the metric for this task
        # NOTE(review): the available_metrics[0] fallback raises IndexError when the
        # list is empty — callers are assumed to always pass at least one metric.
        metric = task_metric_map.get(task, available_metrics[0])
        # Check if metric is available
        metric_col = f"{split}_{metric}"
        if metric_col not in summary_df.columns:
            # Fall back to first available metric
            metric = available_metrics[0]
            metric_col = f"{split}_{metric}"
        st.subheader(f"{task.replace('_', ' ').title()} - {format_metric_name(metric)}")
        # Filter data for this task
        task_df = summary_df[summary_df["task"] == task]
        # Create grid-level comparison plot
        try:
            fig = create_grid_level_comparison_plot(task_df, metric, split)
            st.plotly_chart(fig, width="stretch")
        except Exception as e:
            # Broad catch by design: keep the rest of the page rendering.
            st.error(f"Could not create grid-level comparison plot for {task}: {e}")
        # Show statistics by grid level for this task
        if metric_col in task_df.columns:
            stats = (
                task_df.groupby(["grid", "level"])[metric_col].agg(["mean", "std", "min", "max", "count"]).reset_index()
            )
            stats.columns = ["Grid", "Level", "Mean", "Std Dev", "Min", "Max", "Count"]
            with st.expander(f"Show {format_metric_name(metric)} Statistics by Grid Level", expanded=False):
                st.dataframe(stats.sort_values("Mean", ascending=False), width="stretch", hide_index=True)

View file

@ -0,0 +1,193 @@
"""Section for visualizing experiment inference maps."""
import streamlit as st
from entropice.dashboard.plots.experiment_comparison import create_inference_maps
from entropice.dashboard.utils.loaders import TrainingResult
from entropice.utils.types import GridConfig
@st.fragment
def render_inference_maps_section(
    experiment_name: str,
    training_results: list[TrainingResult],
) -> None:
    """Render the inference maps section.

    Presents a form to pick a grid configuration, task and optional
    target/model filters, then aggregates the matching runs' predictions into
    mean and standard-deviation maps rendered with pydeck.

    Args:
        experiment_name: Name of the experiment
        training_results: List of training results for the experiment
    """
    st.header("🗺️ Inference Maps Analysis")
    st.markdown(
        """
        Visualize the mean and uncertainty (standard deviation) of model predictions across the Arctic region.
        The maps show the spatial distribution of predictions aggregated across multiple training runs.
        """
    )
    if not training_results:
        st.info("No training results available for inference maps.")
        return
    # Extract unique grid configurations from training results
    available_grid_configs = sorted(
        {GridConfig.from_grid_level((tr.settings.grid, tr.settings.level)) for tr in training_results},
        key=lambda gc: gc.sort_key,
    )
    available_tasks = sorted({tr.settings.task for tr in training_results})
    available_targets = sorted({tr.settings.target for tr in training_results})
    available_models = sorted({tr.settings.model for tr in training_results})
    # Create form for selecting parameters
    with st.form("inference_map_form"):
        st.subheader("Map Configuration")
        col1, col2 = st.columns(2)
        # NOTE(review): both selectboxes live in col1 and col2 is unused — confirm
        # whether the task selector was meant to go in col2.
        with col1:
            selected_grid_config = st.selectbox(
                "Grid Configuration",
                options=available_grid_configs,
                format_func=lambda gc: gc.display_name,
                help="Select the grid type and resolution level for the inference map",
            )
            selected_task = st.selectbox(
                "Task",
                options=available_tasks,
                help="Select the prediction task",
            )
        st.subheader("Filters")
        col3, col4 = st.columns(2)
        with col3:
            selected_targets = st.multiselect(
                "Target Datasets",
                options=available_targets,
                default=available_targets,
                help="Filter by target datasets (select all to include all)",
            )
        with col4:
            selected_models = st.multiselect(
                "Model Types",
                options=available_models,
                default=available_models,
                help="Filter by model types (select all to include all)",
            )
        submit_button = st.form_submit_button("Generate Maps", type="primary")
    if submit_button:
        # Extract grid and level from selected config
        selected_grid = selected_grid_config.grid
        selected_level = selected_grid_config.level
        # Filter training results based on selections
        filtered_results = [
            tr
            for tr in training_results
            if tr.settings.grid == selected_grid
            and tr.settings.level == selected_level
            and tr.settings.task == selected_task
            and tr.settings.target in selected_targets
            and tr.settings.model in selected_models
        ]
        if not filtered_results:
            st.warning("No training results match the selected criteria. Please adjust your selections and try again.")
            return
        st.success(f"Found {len(filtered_results)} training runs matching the criteria.")
        # Display metadata about selected runs
        with st.expander("View Selected Training Runs"):
            run_info = []
            for tr in filtered_results:
                info = tr.display_info
                run_info.append(
                    {
                        "Task": info.task,
                        "Target": info.target,
                        "Model": info.model,
                        "Grid": f"{info.grid}_{info.level}",
                        "Created": info.timestamp.strftime("%Y-%m-%d %H:%M"),
                    }
                )
            st.dataframe(run_info, width="stretch")
        # Calculate inference maps
        with st.spinner("Calculating inference maps from predictions..."):
            try:
                # TrainingResult is imported at module level; the redundant
                # function-local re-import was removed.
                inference_gdf = TrainingResult.calculate_inference_maps(filtered_results)
                st.info(
                    f"Generated inference map with {len(inference_gdf):,} grid cells. "
                    + (
                        "Using point-based rendering (>50k cells)."
                        if len(inference_gdf) > 50000
                        else "Using polygon-based rendering."
                    )
                )
            except AssertionError as e:
                # calculate_inference_maps signals invalid selections via assertions.
                st.error(f"Error calculating inference maps: {e}")
                return
            except Exception as e:
                st.error(f"Unexpected error calculating inference maps: {e}")
                st.exception(e)
                return
        # Generate maps
        with st.spinner("Generating cartographic visualizations..."):
            try:
                deck_mean, deck_std = create_inference_maps(
                    inference_gdf,
                    grid=selected_grid,
                    level=selected_level,
                    task=selected_task,
                )
                # Display maps
                st.subheader("📍 Mean Prediction Map")
                # NOTE(review): other charts in this module pass width="stretch";
                # use_container_width is deprecated in newer Streamlit — confirm
                # pydeck_chart supports the same parameter before unifying.
                st.pydeck_chart(deck_mean, use_container_width=True)
                st.subheader("📍 Uncertainty Map (Standard Deviation)")
                st.pydeck_chart(deck_std, use_container_width=True)
                # Display statistics
                st.subheader("📊 Inference Statistics")
                col1, col2, col3, col4 = st.columns(4)
                with col1:
                    st.metric("Grid Cells", f"{len(inference_gdf):,}")
                with col2:
                    st.metric(
                        "Mean Prediction",
                        f"{inference_gdf['mean_prediction'].mean():.4f}",
                    )
                with col3:
                    st.metric(
                        "Avg Uncertainty",
                        f"{inference_gdf['std_prediction'].mean():.4f}",
                    )
                with col4:
                    st.metric(
                        "Max Uncertainty",
                        f"{inference_gdf['std_prediction'].max():.4f}",
                    )
            except Exception as e:
                st.error(f"Error generating maps: {e}")
                st.exception(e)
                return

View file

@ -0,0 +1,77 @@
"""Model comparison section for experiment analysis."""
import pandas as pd
import streamlit as st
from entropice.dashboard.plots.experiment_comparison import create_top_models_bar_chart
from entropice.dashboard.utils.formatters import format_metric_name
@st.fragment
def render_model_comparison(summary_df: pd.DataFrame, available_metrics: list[str]):
    """Render model comparison section.

    For every task, shows a top-10 bar chart plus a detail table using a
    task-appropriate metric on the chosen data split.

    Args:
        summary_df: Summary DataFrame with all results
        available_metrics: List of available metrics
    """
    st.header("🏆 Model Performance Comparison")
    st.markdown(
        """
        This section shows the best performing models for each task.
        """
    )
    # Determine metrics to use per task:
    # classification tasks use F1 variants, regression tasks use R².
    task_metric_map = {
        "binary": "f1",
        "count_regimes": "f1_weighted",
        "density_regimes": "f1_weighted",
        "count": "r2",
        "density": "r2",
    }
    # Split selection
    split = st.selectbox("Data Split", options=["test", "train", "combined"], index=0, key="model_split")
    # Get unique tasks
    unique_tasks = summary_df["task"].unique()
    # For each task, show the best models
    for task in sorted(unique_tasks):
        # Determine the metric for this task
        metric = task_metric_map.get(task, available_metrics[0])
        metric_col = f"{split}_{metric}"
        if metric_col not in summary_df.columns:
            # Fall back to first available metric
            metric = available_metrics[0]
            metric_col = f"{split}_{metric}"
        st.subheader(f"{task.replace('_', ' ').title()}")
        # Filter data for this task
        task_df = summary_df[summary_df["task"] == task].copy()
        # Create visualization
        try:
            fig = create_top_models_bar_chart(task_df, metric, task, split, top_n=10)
            st.plotly_chart(fig, width="stretch")
        except Exception as e:
            # Broad catch by design: one failed chart must not stop the page.
            st.error(f"Could not create model comparison chart for {task}: {e}")
        # Show table in expander
        with st.expander("Show Top 10 Models Table", expanded=False):
            top_models = task_df.nlargest(10, metric_col)[
                ["model", "grid", "level", "target", metric_col, "method"]
            ].copy()
            top_models.columns = ["Model", "Grid", "Level", "Target", format_metric_name(metric), "Method"]
            st.dataframe(
                top_models.reset_index(drop=True),
                width="stretch",
                hide_index=True,
            )
        st.divider()

View file

@ -0,0 +1,92 @@
"""Experiment overview and sidebar sections."""
import pandas as pd
import streamlit as st
from entropice.dashboard.utils.loaders import (
AutogluonTrainingResult,
TrainingResult,
get_available_experiments,
)
def render_experiment_sidebar() -> str | None:
    """Render sidebar for experiment selection.

    Lists every experiment directory that contains training results and lets
    the user pick one.

    Returns:
        Selected experiment name or None
    """
    st.sidebar.header("🔬 Experiment Selection")
    experiment_names = get_available_experiments()
    # Guard clause: nothing to select from.
    if not experiment_names:
        st.sidebar.warning("No experiments found. Create an experiment directory with training results first.")
        return None
    return st.sidebar.selectbox(
        "Select Experiment",
        options=experiment_names,
        index=0,
        help="Choose an experiment to analyze",
    )
def render_experiment_overview(
    experiment_name: str,
    training_results: list[TrainingResult],
    autogluon_results: list[AutogluonTrainingResult],
    summary_df: pd.DataFrame,
):
    """Render experiment overview section.

    Shows headline run counts and a sortable summary table of all results.

    Args:
        experiment_name: Name of the experiment
        training_results: List of RandomSearchCV training results
        autogluon_results: List of AutoGluon training results
        summary_df: Summary DataFrame with all results
    """
    st.header(f"📊 Experiment: {experiment_name}")
    # Show summary statistics
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Training Runs", len(training_results) + len(autogluon_results))
    with col2:
        st.metric("RandomSearchCV Runs", len(training_results))
    with col3:
        st.metric("AutoGluon Runs", len(autogluon_results))
    with col4:
        # A configuration is a unique (grid, level, task, target) combination.
        unique_configs = summary_df[["grid", "level", "task", "target"]].drop_duplicates()
        st.metric("Unique Configurations", len(unique_configs))
    st.divider()
    # Show summary table
    st.subheader("Experiment Summary")
    display_columns = [
        "method",
        "task",
        "target",
        "model",
        "grid_level",
        "test_score",
        "best_metric",
        "n_trials",
    ]
    # Only show columns that actually exist in the summary frame.
    available_columns = [col for col in display_columns if col in summary_df.columns]
    st.dataframe(
        summary_df[available_columns].sort_values("test_score", ascending=False),
        width="stretch",
        hide_index=True,
    )

View file

@ -8,6 +8,7 @@ from datetime import datetime
from pathlib import Path from pathlib import Path
import antimeridian import antimeridian
import geopandas as gpd
import pandas as pd import pandas as pd
import streamlit as st import streamlit as st
import toml import toml
@ -19,7 +20,7 @@ import entropice.utils.paths
from entropice.dashboard.utils.formatters import TrainingResultDisplayInfo from entropice.dashboard.utils.formatters import TrainingResultDisplayInfo
from entropice.ml.autogluon_training import AutoGluonTrainingSettings from entropice.ml.autogluon_training import AutoGluonTrainingSettings
from entropice.ml.dataset import DatasetEnsemble, TrainingSet from entropice.ml.dataset import DatasetEnsemble, TrainingSet
from entropice.ml.training import TrainingSettings from entropice.ml.randomsearch import TrainingSettings
from entropice.utils.types import GridConfig, TargetDataset, Task, all_target_datasets, all_tasks from entropice.utils.types import GridConfig, TargetDataset, Task, all_target_datasets, all_tasks
@ -215,6 +216,37 @@ class TrainingResult:
records.append(record) records.append(record)
return pd.DataFrame.from_records(records) return pd.DataFrame.from_records(records)
@staticmethod
def calculate_inference_maps(training_results: list["TrainingResult"]) -> gpd.GeoDataFrame:
    """Calculate the mean and standard deviation of inference maps across multiple training results.

    Args:
        training_results: Results to aggregate; all must share the same grid and level.

    Returns:
        GeoDataFrame with cell geometry plus ``mean_prediction`` and ``std_prediction`` columns.

    Raises:
        AssertionError: If grids/levels differ or no prediction files are found.
    """
    assert len({tr.settings.grid for tr in training_results}) == 1, "All training results must have the same grid"
    assert len({tr.settings.level for tr in training_results}) == 1, "All training results must have the same level"
    grid = training_results[0].settings.grid
    level = training_results[0].settings.level
    gridfile = entropice.utils.paths.get_grid_file(grid, level)
    cells = gpd.read_parquet(gridfile, columns=["cell_id", "geometry"])
    if grid == "hex":
        # Hex grid stores cell ids as hexadecimal strings; predictions index by integers.
        cells["cell_id"] = cells["cell_id"].apply(lambda x: int(x, 16))
    cells = cells.set_index("cell_id")
    vals = []
    for tr in training_results:
        preds_file = tr.path / "predicted_probabilities.parquet"
        if not preds_file.exists():
            continue
        preds = pd.read_parquet(preds_file, columns=["cell_id", "predicted"]).set_index("cell_id")
        if preds["predicted"].dtype == "category":
            # Categorical (classification) predictions become integer codes for averaging.
            preds["predicted"] = preds["predicted"].cat.codes
        vals.append(preds)
    # pd.concat raises an opaque ValueError on an empty list; fail with the same
    # AssertionError style the callers already handle.
    assert vals, "No prediction files found for the selected training results"
    all_preds = pd.concat(vals, axis=1)
    mean_preds = all_preds.mean(axis=1)
    std_preds = all_preds.std(axis=1)
    cells["mean_prediction"] = mean_preds
    cells["std_prediction"] = std_preds
    return cells.reset_index()
@st.cache_data(ttl=300) # Cache for 5 minutes @st.cache_data(ttl=300) # Cache for 5 minutes
def load_all_training_results() -> list[TrainingResult]: def load_all_training_results() -> list[TrainingResult]:
@ -410,6 +442,185 @@ def load_training_sets(ensemble: DatasetEnsemble) -> dict[TargetDataset, dict[Ta
return train_data_dict return train_data_dict
def get_available_experiments() -> list[str]:
    """Get list of available experiment names from the results directory.

    A directory counts as an experiment if any of its subdirectories contains
    either a RandomSearchCV (``search_results.parquet``) or AutoGluon
    (``leaderboard.parquet``) result file.

    Returns:
        Sorted list of experiment directory names
    """
    results_dir = entropice.utils.paths.RESULTS_DIR
    experiments = []
    for item in results_dir.iterdir():
        if not item.is_dir():
            continue
        # any() replaces the manual flag-and-break loop; short-circuits on first hit.
        has_training_results = any(
            (subitem / "search_results.parquet").exists() or (subitem / "leaderboard.parquet").exists()
            for subitem in item.iterdir()
            if subitem.is_dir()
        )
        if has_training_results:
            experiments.append(item.name)
    return sorted(experiments)
def _load_experiment_results(experiment_name: str, *, autogluon: bool) -> list:
    """Shared loader for experiment results.

    Scans the experiment directory for result subdirectories, keeping only
    AutoGluon ones (name contains "autogluon") when ``autogluon`` is True and
    only non-AutoGluon ones otherwise. Incomplete results are skipped.

    Args:
        experiment_name: Name of the experiment directory
        autogluon: Whether to load AutoGluon (True) or RandomSearchCV (False) results

    Returns:
        List of result objects sorted by creation time (most recent first)
    """
    experiment_dir = entropice.utils.paths.RESULTS_DIR / experiment_name
    if not experiment_dir.exists():
        return []
    result_cls = AutogluonTrainingResult if autogluon else TrainingResult
    results = []
    for result_path in experiment_dir.iterdir():
        if not result_path.is_dir():
            continue
        # The directory name decides which loader owns it.
        if ("autogluon" in result_path.name.lower()) != autogluon:
            continue
        try:
            results.append(result_cls.from_path(result_path, experiment_name))
        except FileNotFoundError:
            pass  # Skip incomplete results
    # Sort by creation time (most recent first)
    results.sort(key=lambda tr: tr.created_at, reverse=True)
    return results


def load_experiment_training_results(experiment_name: str) -> list[TrainingResult]:
    """Load all training results for a specific experiment.

    Args:
        experiment_name: Name of the experiment directory

    Returns:
        List of TrainingResult objects for the experiment
    """
    return _load_experiment_results(experiment_name, autogluon=False)


def load_experiment_autogluon_results(experiment_name: str) -> list[AutogluonTrainingResult]:
    """Load all AutoGluon training results for a specific experiment.

    Args:
        experiment_name: Name of the experiment directory

    Returns:
        List of AutogluonTrainingResult objects for the experiment
    """
    return _load_experiment_results(experiment_name, autogluon=True)
def create_experiment_summary_df(
    training_results: list[TrainingResult], autogluon_results: list[AutogluonTrainingResult]
) -> pd.DataFrame:
    """Create a summary DataFrame for all results in an experiment.

    Each result contributes one row with its headline scores plus one column
    per train/test/combined metric. AutoGluon rows have no separate train or
    combined scores and use the fixed model name ``"ensemble"``.

    Args:
        training_results: List of TrainingResult objects
        autogluon_results: List of AutogluonTrainingResult objects

    Returns:
        DataFrame with summary statistics for the experiment
    """
    records = []
    # Add RandomSearchCV results
    for tr in training_results:
        info = tr.display_info
        # Headline scores are reported in the result's own best metric.
        best_metric_name = tr._get_best_metric_name()
        record = {
            "method": "RandomSearchCV",
            "task": info.task,
            "target": info.target,
            "model": info.model,
            "grid": info.grid,
            "level": info.level,
            "grid_level": f"{info.grid}_{info.level}",
            "train_score": tr.train_metrics.get(best_metric_name, float("nan")),
            "test_score": tr.test_metrics.get(best_metric_name, float("nan")),
            "combined_score": tr.combined_metrics.get(best_metric_name, float("nan")),
            "best_metric": best_metric_name,
            "n_trials": len(tr.results),
            "created_at": tr.created_at,
            "path": tr.path,
        }
        # Add all train metrics
        for metric, value in tr.train_metrics.items():
            record[f"train_{metric}"] = value
        # Add all test metrics
        for metric, value in tr.test_metrics.items():
            record[f"test_{metric}"] = value
        # Add all combined metrics
        for metric, value in tr.combined_metrics.items():
            record[f"combined_{metric}"] = value
        records.append(record)
    # Add AutoGluon results
    for ag in autogluon_results:
        info = ag.display_info
        best_metric_name = ag._get_best_metric_name()
        record = {
            "method": "AutoGluon",
            "task": info.task,
            "target": info.target,
            "model": "ensemble",  # AutoGluon is an ensemble
            "grid": info.grid,
            "level": info.level,
            "grid_level": f"{info.grid}_{info.level}",
            "train_score": float("nan"),  # AutoGluon doesn't separate train scores
            "test_score": ag.test_metrics.get(best_metric_name, float("nan")),
            "combined_score": float("nan"),
            "best_metric": best_metric_name,
            "n_trials": len(ag.leaderboard),
            "created_at": ag.created_at,
            "path": ag.path,
        }
        # Add test metrics; non-numeric entries (e.g. strings) are dropped.
        for metric, value in ag.test_metrics.items():
            if isinstance(value, (int, float)):
                record[f"test_{metric}"] = value
        records.append(record)
    return pd.DataFrame(records)
@dataclass @dataclass
class StorageInfo: class StorageInfo:
"""Storage information for a directory.""" """Storage information for a directory."""

View file

@ -138,6 +138,7 @@ def render_dataset_page():
) )
if era5_members: if era5_members:
tab_names.append("🌡️ ERA5") tab_names.append("🌡️ ERA5")
tab_names += ["🔗 Correlations"]
tabs = st.tabs(tab_names) tabs = st.tabs(tab_names)
with tabs[0]: with tabs[0]:
@ -168,6 +169,12 @@ def render_dataset_page():
era5_member_dataset = {m: member_datasets[m] for m in era5_members} era5_member_dataset = {m: member_datasets[m] for m in era5_members}
era5_member_stats = {m: stats.members[m] for m in era5_members} era5_member_stats = {m: stats.members[m] for m in era5_members}
render_era5_tab(era5_member_dataset, grid_gdf, era5_member_stats) render_era5_tab(era5_member_dataset, grid_gdf, era5_member_stats)
tab_index += 1
with tabs[tab_index]:
st.header("🔗 Cross-Dataset Correlation Analysis")
# Extract grid area series
# grid_area_series = grid_gdf.set_index("cell_id")["area_km2"] if "area_km2" in grid_gdf.columns else None
# render_correlations_tab(member_datasets, grid_area_series=grid_area_series)
st.balloons() st.balloons()
stopwatch.summary() stopwatch.summary()

View file

@ -0,0 +1,87 @@
"""Experiment Analysis page: Compare multiple training runs within an experiment."""
import streamlit as st
from entropice.dashboard.sections.experiment_feature_importance import render_feature_importance_analysis
from entropice.dashboard.sections.experiment_grid_analysis import render_grid_level_analysis
from entropice.dashboard.sections.experiment_inference_maps import render_inference_maps_section
from entropice.dashboard.sections.experiment_model_comparison import render_model_comparison
from entropice.dashboard.sections.experiment_overview import (
render_experiment_overview,
render_experiment_sidebar,
)
from entropice.dashboard.utils.loaders import (
create_experiment_summary_df,
load_experiment_autogluon_results,
load_experiment_training_results,
)
def render_experiment_analysis_page():
    """Render the Experiment Analysis page of the dashboard."""
    st.title("🔬 Experiment Analysis")
    st.markdown(
        """
    Analyze and compare multiple training runs within an experiment.
    Select an experiment from the sidebar to explore:
    - How grid levels affect performance
    - Best models across different tasks and targets
    - Feature importance patterns
    """
    )
    # Let the sidebar pick the experiment; without one there is nothing to show.
    selected_experiment = render_experiment_sidebar()
    if selected_experiment is None:
        st.info("👈 Select an experiment from the sidebar to begin analysis.")
        st.stop()
    assert selected_experiment is not None, "Experiment must be selected"
    # Load both the RandomSearchCV and the AutoGluon results of this experiment.
    with st.spinner(f"Loading results for experiment: {selected_experiment}..."):
        training_results = load_experiment_training_results(selected_experiment)
        autogluon_results = load_experiment_autogluon_results(selected_experiment)
    if not training_results and not autogluon_results:
        st.warning(f"No training results found in experiment: {selected_experiment}")
        st.stop()
    summary_df = create_experiment_summary_df(training_results, autogluon_results)
    # Metric names are derived from the "test_*" summary columns.
    available_metrics = [col.replace("test_", "") for col in summary_df.columns if col.startswith("test_")]
    if not available_metrics:
        st.error("No metrics found in the experiment results.")
        st.stop()
    # Render every analysis section, separated by dividers.
    sections = (
        lambda: render_experiment_overview(selected_experiment, training_results, autogluon_results, summary_df),
        lambda: render_grid_level_analysis(summary_df, available_metrics),
        lambda: render_model_comparison(summary_df, available_metrics),
        lambda: render_feature_importance_analysis(training_results, autogluon_results),
        lambda: render_inference_maps_section(selected_experiment, training_results),
    )
    for render_section in sections:
        render_section()
        st.divider()
    # Raw data section
    st.header("📄 Raw Experiment Data")
    with st.expander("View Complete Summary DataFrame"):
        st.dataframe(summary_df, width="stretch")

View file

@ -85,7 +85,7 @@ def render_inference_statistics_section(predictions_gdf: gpd.GeoDataFrame, task:
Args: Args:
predictions_gdf: GeoDataFrame with predictions. predictions_gdf: GeoDataFrame with predictions.
task: Task type ('binary', 'count', 'density'). task: Task type ('binary', 'count_regimes', 'density_regimes', 'count', 'density').
""" """
st.header("📊 Inference Summary") st.header("📊 Inference Summary")
@ -125,7 +125,7 @@ def render_class_distribution_section(predictions_gdf: gpd.GeoDataFrame, task: s
Args: Args:
predictions_gdf: GeoDataFrame with predictions. predictions_gdf: GeoDataFrame with predictions.
task: Task type ('binary', 'count', 'density'). task: Task type ('binary', 'count_regimes', 'density_regimes', 'count', 'density').
""" """
st.header("📈 Class Distribution") st.header("📈 Class Distribution")
@ -138,7 +138,7 @@ def render_class_comparison_section(predictions_gdf: gpd.GeoDataFrame, task: str
Args: Args:
predictions_gdf: GeoDataFrame with predictions. predictions_gdf: GeoDataFrame with predictions.
task: Task type ('binary', 'count', 'density'). task: Task type ('binary', 'count_regimes', 'density_regimes', 'count', 'density').
""" """
st.header("🔍 Class Comparison Analysis") st.header("🔍 Class Comparison Analysis")

View file

@ -0,0 +1,87 @@
from typing import cast
import cyclopts
from stopuhr import stopwatch
from entropice.ml.autogluon import RunSettings as AutoGluonRunSettings
from entropice.ml.autogluon import train as train_autogluon
from entropice.ml.dataset import DatasetEnsemble
from entropice.ml.hpsearchcv import RunSettings as HPOCVRunSettings
from entropice.ml.hpsearchcv import hpsearch_cv
from entropice.utils.paths import RESULTS_DIR
from entropice.utils.types import Grid, Model, TargetDataset, Task
# CLI entry point for the feature-importance experiment script.
cli = cyclopts.App("entropice-feature-importance")
# Name of the experiment; determines the results subdirectory for all runs below.
EXPERIMENT_NAME = "feature_importance_era5-shoulder_arcticdem"
# EXPERIMENT_NAME = "tobis-final-tests"
@cli.default
def main(
    grid: Grid,
    target: TargetDataset,
):
    """Run the feature-importance experiment over multiple levels of *grid*.

    For each grid level, trains an AutoGluon ensemble plus several HPO-CV
    models (xgboost, rf, knn, and espa for binary) on the "binary" and
    "density" tasks, and finally stores the accumulated training times as a
    parquet file in the experiment's results directory.

    Args:
        grid (Grid): The spatial grid type ("hex" or "healpix").
        target (TargetDataset): The target dataset to train against.
    """
    # Full sweep would be [3, 4, 5, 6] (hex) / [6, 7, 8, 9, 10] (healpix);
    # only the coarsest and finest levels are run (removed dead first assignment).
    levels = [3, 6] if grid == "hex" else [6, 10]
    for level in levels:
        print(f"Running feature importance experiment for {grid} grid at level {level}...")
        # ArcticDEM is always reduced to medians; ERA5-shoulder is additionally
        # reduced on the coarse levels to keep the feature count manageable.
        dimension_filters = {"ArcticDEM": {"aggregations": ["median"]}}
        if (grid == "hex" and level in [3, 4]) or (grid == "healpix" and level in [6, 7]):
            dimension_filters["ERA5-shoulder"] = {"aggregations": ["median"]}
        dataset_ensemble = DatasetEnsemble(
            grid=grid, level=level, members=["ArcticDEM", "ERA5-shoulder"], dimension_filters=dimension_filters
        )
        for task in cast(list[Task], ["binary", "density"]):
            print(f"\nRunning for {task}...")
            # AutoGluon
            time_limit = 30 * 60  # 30 minutes
            presets = "extreme"
            settings = AutoGluonRunSettings(
                time_limit=time_limit,
                presets=presets,
                verbosity=2,
                task=task,
                target=target,
            )
            train_autogluon(dataset_ensemble, settings, experiment=EXPERIMENT_NAME)
            # HPOCV: stratified splitting is only meaningful for classification.
            splitter = "stratified_shuffle" if task == "binary" else "kfold"
            models: list[Model] = ["xgboost", "rf", "knn"]
            if task == "binary":
                models.append("espa")
            for model in models:
                print(f"\nRunning HPOCV for model {model}...")
                # Search budget per model: cheaper models get more iterations.
                n_iter = {
                    "espa": 300,
                    "xgboost": 100,
                    "rf": 40,
                    "knn": 20,
                }[model]
                # Distance/cluster based models need scaled features; tree models do not.
                scaler = "standard" if model in ["espa", "knn"] else "none"
                normalize = scaler != "none"
                settings = HPOCVRunSettings(
                    n_iter=n_iter,
                    task=task,
                    target=target,
                    splitter=splitter,
                    model=model,
                    scaler=scaler,
                    normalize=normalize,
                )
                hpsearch_cv(dataset_ensemble, settings, experiment=EXPERIMENT_NAME)
    stopwatch.summary()
    times = stopwatch.export()
    times.to_parquet(RESULTS_DIR / EXPERIMENT_NAME / f"training_times_{target}_{grid}.parquet")
    print("Done.")
# Allow running this experiment script directly from the command line.
if __name__ == "__main__":
    cli()

View file

@ -27,6 +27,7 @@ pretty.install()
darts_v1_l2_file = DARTS_V1_DIR / "DARTS_NitzeEtAl_v1-2_features_2018-2023_level2.parquet" darts_v1_l2_file = DARTS_V1_DIR / "DARTS_NitzeEtAl_v1-2_features_2018-2023_level2.parquet"
darts_v1_l2_cov_file = DARTS_V1_DIR / "DARTS_NitzeEtAl_v1-2_coverage_2018-2023_level2.parquet" darts_v1_l2_cov_file = DARTS_V1_DIR / "DARTS_NitzeEtAl_v1-2_coverage_2018-2023_level2.parquet"
darts_v1_corrections = DARTS_V1_DIR / "negative_correction.geojson"
darts_ml_training_labels_repo = DARTS_MLLABELS_DIR / "ML_training_labels" / "retrogressive_thaw_slumps" darts_ml_training_labels_repo = DARTS_MLLABELS_DIR / "ML_training_labels" / "retrogressive_thaw_slumps"
cli = cyclopts.App(name="darts-rts") cli = cyclopts.App(name="darts-rts")
@ -153,6 +154,22 @@ def extract_darts_v1(grid: Grid, level: int):
darts_l2 = gpd.read_parquet(darts_v1_l2_file) darts_l2 = gpd.read_parquet(darts_v1_l2_file)
darts_cov_l2 = gpd.read_parquet(darts_v1_l2_cov_file) darts_cov_l2 = gpd.read_parquet(darts_v1_l2_cov_file)
grid_gdf, cell_areas = _load_grid(grid, level) grid_gdf, cell_areas = _load_grid(grid, level)
corrections = gpd.read_file(darts_v1_corrections).to_crs(darts_l2.crs)
with stopwatch("Apply corrections"):
# The correction file is just an area of sure negatives
# Thus, we first need to remove all RTS labels that intersect with the correction area,
darts_l2 = gpd.overlay(darts_l2, corrections, how="difference")
# then we need to add the correction area as coverage to the coverage file per year.
darts_cov_l2_cor = []
for year in darts_cov_l2["year"].unique():
year_cov = darts_cov_l2[darts_cov_l2["year"] == year]
year_cov_cor = gpd.overlay(year_cov, corrections, how="union")
year_cov_cor["year"] = year
darts_cov_l2_cor.append(year_cov_cor)
darts_cov_l2_cor = gpd.GeoDataFrame(pd.concat(darts_cov_l2_cor, ignore_index=True))
darts_cov_l2_cor["year"] = darts_cov_l2_cor["year"].astype(int)
darts_cov_l2 = darts_cov_l2_cor
with stopwatch("Assign RTS to grid"): with stopwatch("Assign RTS to grid"):
grid_l2 = grid_gdf.overlay(darts_l2.to_crs(grid_gdf.crs), how="intersection") grid_l2 = grid_gdf.overlay(darts_l2.to_crs(grid_gdf.crs), how="intersection")

View file

@ -7,6 +7,6 @@ This package contains modules for machine learning workflows:
- inference: Batch prediction pipeline for trained classifiers - inference: Batch prediction pipeline for trained classifiers
""" """
from . import dataset, inference, training from . import dataset, inference, randomsearch
__all__ = ["dataset", "inference", "training"] __all__ = ["dataset", "inference", "randomsearch"]

View file

@ -0,0 +1,191 @@
"""Training of models with AutoGluon."""
from dataclasses import dataclass
from typing import cast
import cyclopts
import pandas as pd
import shap.maskers
import xarray as xr
from autogluon.tabular import TabularDataset, TabularPredictor
from rich import pretty, traceback
from shap import Explainer, Explanation
from sklearn import set_config
from stopuhr import stopwatch
from entropice.ml.dataset import DatasetEnsemble
from entropice.ml.inference import predict_proba
from entropice.utils.paths import get_training_results_dir
from entropice.utils.training import AutoML, Training
from entropice.utils.types import TargetDataset, Task
traceback.install()
pretty.install()
cli = cyclopts.App(
"entropice-autogluon",
config=cyclopts.config.Toml("autogluon-config.toml", root_keys=["tool", "entropice-autogluon"]), # ty:ignore[invalid-argument-type]
)
@cyclopts.Parameter("*")
@dataclass(frozen=True, kw_only=True)
class RunSettings:
    """Run settings for training."""

    # Maximum wall-clock time AutoGluon may spend fitting, in seconds.
    time_limit: int = 3600  # Time limit in seconds (1 hour default)
    # AutoGluon quality preset passed to TabularPredictor.fit().
    presets: str = "extreme"
    # Prediction task the training set is built for.
    task: Task = "binary"
    # Target dataset providing the labels.
    target: TargetDataset = "darts_v1"
    verbosity: int = 2  # Verbosity level (0-4)
def _compute_metrics_and_confusion_matrix(  # noqa: C901
    predictor: TabularPredictor, train_data: pd.DataFrame, test_data: pd.DataFrame, complete_data: pd.DataFrame
) -> tuple[pd.DataFrame, xr.Dataset | None]:
    """Evaluate *predictor* on the train, test and complete splits.

    AutoGluon's evaluation output (scalar metrics plus the nested
    classification report) is flattened into a tidy long-format DataFrame.
    For classification tasks, the per-split confusion matrices are collected
    into an xarray Dataset; for regression tasks the second return is None.
    """
    records = []
    matrices: dict[str, xr.DataArray] = {}
    for split_name, data in {"train": train_data, "test": test_data, "complete": complete_data}.items():
        scores = predictor.evaluate(data, display=True, detailed_report=True)
        for metric, score in scores.items():
            if metric == "confusion_matrix":
                score = cast(pd.DataFrame, score)
                matrices[split_name] = xr.DataArray(
                    score.to_numpy(),
                    dims=("y_true", "y_pred"),
                    coords={"y_true": score.index.tolist(), "y_pred": score.columns.tolist()},
                )
            elif metric == "classification_report":
                score = cast(dict[str, dict[str, float]], score)
                score.pop("accuracy")  # Accuracy is already included as a separate metric
                # Flatten the macro/weighted averages into prefixed metric names.
                for avg_key, prefix in (("macro avg", "macro_avg"), ("weighted avg", "weighted_avg")):
                    for avg_metric, avg_score in score.pop(avg_key).items():
                        records.append({"dataset": split_name, "metric": f"{prefix}_{avg_metric}", "score": avg_score})
                # Remaining report entries are per-class scores.
                for class_name, class_scores in score.items():
                    class_name = class_name.replace(" ", "-")
                    for class_metric, class_score in class_scores.items():
                        records.append(
                            {"dataset": split_name, "metric": f"{class_name}_{class_metric}", "score": class_score}
                        )
            else:  # Scalar metric
                records.append({"dataset": split_name, "metric": metric, "score": score})
    if len(matrices) == 0:
        return pd.DataFrame(records), None
    elif len(matrices) == 3:
        return pd.DataFrame(records), xr.Dataset(matrices)
    else:
        raise RuntimeError("Confusion matrix should be computed for all datasets or none.")
def _compute_shap_explanation(
    predictor: TabularPredictor,
    train_data: pd.DataFrame,
    test_data: pd.DataFrame,
    feature_names: list[str],
    target_labels: list[str] | None,
) -> Explanation:
    """Compute a SHAP explanation for *predictor* on (a sample of) the test set.

    The training features (without the label column) serve as the background
    distribution; at most 200 test rows are explained to bound runtime.
    """
    background = train_data.drop(columns=["label"])
    masker = shap.maskers.Independent(data=background)
    # Classification models are explained via class probabilities.
    is_classification = predictor.problem_type in ["binary", "multiclass"]
    explainer = Explainer(
        predictor.predict_proba if is_classification else predictor.predict,
        masker=masker,
        seed=42,
        feature_names=feature_names,
        output_names=target_labels,
    )
    samples = test_data.drop(columns=["label"])
    if len(samples) > 200:
        samples = samples.sample(n=200, random_state=42)
    return explainer(samples)
@cli.default
def train(
    dataset_ensemble: DatasetEnsemble,
    settings: RunSettings = RunSettings(),
    experiment: str | None = None,
) -> Training:
    """Train an AutoGluon TabularPredictor on the dataset ensemble.

    Fits an AutoGluon ensemble within the configured time budget, evaluates it
    on the train/test/complete splits, computes feature importance and a SHAP
    explanation, predicts probabilities for all grid cells, and persists
    everything as a ``Training`` summary.

    Args:
        dataset_ensemble (DatasetEnsemble): The dataset ensemble configuration.
        settings (RunSettings): This runs settings.
        experiment (str | None): Optional experiment name for results directory.

    Returns:
        Training: The saved training summary (model, metrics, SHAP explanation, predictions).
    """
    # AutoGluon is not array-API aware; make sure sklearn dispatch is off.
    set_config(array_api_dispatch=False)
    results_dir = get_training_results_dir(
        experiment=experiment,
        name="autogluon",
        grid=dataset_ensemble.grid,
        level=dataset_ensemble.level,
        task=settings.task,
        target=settings.target,
    )
    print(f"\n💾 Results directory: {results_dir}")
    print("Creating training data...")
    training_data = dataset_ensemble.create_training_set(task=settings.task, target=settings.target)
    # Convert to AutoGluon TabularDataset
    train_data: pd.DataFrame = TabularDataset(training_data.to_dataframe("train"))  # ty:ignore[invalid-assignment]
    test_data: pd.DataFrame = TabularDataset(training_data.to_dataframe("test"))  # ty:ignore[invalid-assignment]
    complete_data: pd.DataFrame = TabularDataset(training_data.to_dataframe(None))  # ty:ignore[invalid-assignment]
    # Initialize TabularPredictor
    print(f"\n🚀 Initializing AutoGluon TabularPredictor (preset='{settings.presets}')...")
    predictor = TabularPredictor(
        label="label",
        path=str(results_dir / "models"),
        verbosity=settings.verbosity,
    )
    # Train models
    print(f"\n⚡ Training models for {settings.time_limit / 60}min...")
    with stopwatch("AutoGluon training"):
        predictor.fit(
            train_data=train_data,
            time_limit=settings.time_limit,
            presets=settings.presets,
            num_gpus=1,
        )
    print("\n📊 Evaluating model performance...")
    leaderboard = predictor.leaderboard(test_data)
    feature_importance = predictor.feature_importance(test_data)
    metrics, confusion_matrix = _compute_metrics_and_confusion_matrix(predictor, train_data, test_data, complete_data)
    with stopwatch("Explaining model predictions with SHAP..."):
        explanation = _compute_shap_explanation(
            predictor, train_data, test_data, training_data.feature_names, training_data.target_labels
        )
    # Run batch inference over every cell of the grid, not just the labeled ones.
    print("Predicting probabilities for all cells...")
    preds = predict_proba(dataset_ensemble, model=predictor, task=settings.task)
    print(f"Predicted probabilities DataFrame with {len(preds)} entries.")
    summary = Training(
        path=results_dir,
        dataset=dataset_ensemble,
        method=AutoML(time_budget=settings.time_limit, preset=settings.presets, hpo=False),
        task=settings.task,
        target=settings.target,
        training_set=training_data,
        model=predictor,
        model_type="autogluon",
        metrics=metrics,
        feature_importance=feature_importance,
        shap_explanation=explanation,
        predictions=preds,
        confusion_matrix=confusion_matrix,
        cv_results=None,
        leaderboard=leaderboard,
    )
    summary.save()
    return summary

View file

@ -1,4 +1,4 @@
"""Training with AutoGluon TabularPredictor for automated ML.""" """DePRECATED!!! Training with AutoGluon TabularPredictor for automated ML."""
import pickle import pickle
from dataclasses import asdict, dataclass from dataclasses import asdict, dataclass
@ -12,7 +12,7 @@ from sklearn import set_config
from stopuhr import stopwatch from stopuhr import stopwatch
from entropice.ml.dataset import DatasetEnsemble from entropice.ml.dataset import DatasetEnsemble
from entropice.utils.paths import get_autogluon_results_dir from entropice.utils.paths import get_training_results_dir
from entropice.utils.types import TargetDataset, Task from entropice.utils.types import TargetDataset, Task
traceback.install() traceback.install()
@ -101,11 +101,13 @@ def autogluon_train(
print(f"📈 Evaluation metric: {eval_metric}") print(f"📈 Evaluation metric: {eval_metric}")
# Create results directory # Create results directory
results_dir = get_autogluon_results_dir( results_dir = get_training_results_dir(
experiment=experiment, experiment=experiment,
grid=dataset_ensemble.grid, grid=dataset_ensemble.grid,
level=dataset_ensemble.level, level=dataset_ensemble.level,
task=settings.task, task=settings.task,
target=settings.target,
name="autogluon",
) )
print(f"\n💾 Results directory: {results_dir}") print(f"\n💾 Results directory: {results_dir}")

View file

@ -18,7 +18,7 @@ from collections.abc import Generator
from dataclasses import asdict, dataclass, field from dataclasses import asdict, dataclass, field
from functools import cache, cached_property from functools import cache, cached_property
from itertools import product from itertools import product
from typing import Literal, cast from typing import Literal, TypeVar, cast
import cupy as cp import cupy as cp
import cyclopts import cyclopts
@ -28,6 +28,7 @@ import pandas as pd
import seaborn as sns import seaborn as sns
import torch import torch
import xarray as xr import xarray as xr
from cuml import KMeans
from rich import pretty, traceback from rich import pretty, traceback
from sklearn import set_config from sklearn import set_config
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
@ -118,26 +119,29 @@ def bin_values(
return binned return binned
ArrayType = TypeVar("ArrayType", torch.Tensor, np.ndarray, cp.ndarray)
@dataclass(frozen=True, eq=False) @dataclass(frozen=True, eq=False)
class SplittedArrays: class SplittedArrays[ArrayType: (torch.Tensor, np.ndarray, cp.ndarray)]:
"""Small wrapper for train and test arrays.""" """Small wrapper for train and test arrays."""
train: torch.Tensor | np.ndarray | cp.ndarray train: ArrayType
test: torch.Tensor | np.ndarray | cp.ndarray test: ArrayType
@cached_property @cached_property
def combined(self) -> torch.Tensor | np.ndarray | cp.ndarray: def combined(self) -> ArrayType:
"""Combined train and test arrays.""" """Combined train and test arrays."""
if isinstance(self.train, torch.Tensor) and isinstance(self.test, torch.Tensor): if isinstance(self.train, torch.Tensor) and isinstance(self.test, torch.Tensor):
return torch.cat([self.train, self.test], dim=0) return torch.cat([self.train, self.test], dim=0) # ty:ignore[invalid-return-type]
elif isinstance(self.train, cp.ndarray) and isinstance(self.test, cp.ndarray): elif isinstance(self.train, cp.ndarray) and isinstance(self.test, cp.ndarray):
return cp.concatenate([self.train, self.test], axis=0) return cp.concatenate([self.train, self.test], axis=0)
elif isinstance(self.train, np.ndarray) and isinstance(self.test, np.ndarray): elif isinstance(self.train, np.ndarray) and isinstance(self.test, np.ndarray):
return np.concatenate([self.train, self.test], axis=0) return np.concatenate([self.train, self.test], axis=0) # ty:ignore[invalid-return-type]
else: else:
raise TypeError("Incompatible types for train and test arrays.") raise TypeError("Incompatible types for train and test arrays.")
def as_numpy(self) -> "SplittedArrays": def as_numpy(self) -> "SplittedArrays[np.ndarray]":
"""Return the arrays as numpy arrays.""" """Return the arrays as numpy arrays."""
train_np = ( train_np = (
self.train.cpu().numpy() self.train.cpu().numpy()
@ -157,13 +161,13 @@ class SplittedArrays:
@dataclass(frozen=True, eq=False) @dataclass(frozen=True, eq=False)
class TrainingSet: class TrainingSet[ArrayType: (torch.Tensor, np.ndarray, cp.ndarray)]:
"""Container for the training dataset.""" """Container for the training dataset."""
targets: gpd.GeoDataFrame targets: gpd.GeoDataFrame
features: pd.DataFrame features: pd.DataFrame
X: SplittedArrays X: SplittedArrays[ArrayType]
y: SplittedArrays y: SplittedArrays[ArrayType]
z: pd.Series z: pd.Series
split: pd.Series split: pd.Series
@ -214,6 +218,16 @@ class TrainingSet:
"""Names of the features.""" """Names of the features."""
return self.features.columns.tolist() return self.features.columns.tolist()
@cached_property
def clusters(self) -> pd.Series:
"""Geo-Cluster assignments for each sample, based on the geometries of the target GeoDataFrame."""
centroids = self.targets.to_crs("EPSG:3413")["geometry"].centroid
centroids = cp.array([centroids.x.to_numpy(), centroids.y.to_numpy()]).T
# Use kMeans to cluster the centroids into 10 clusters
kmeans = KMeans(n_clusters=10, random_state=42)
clusters = kmeans.fit_predict(centroids).get()
return pd.Series(clusters, index=self.targets.index)
def __len__(self): def __len__(self):
return len(self.z) return len(self.z)

View file

View file

@ -0,0 +1,395 @@
"""Training of models with hyperparameter search with cross-validation."""
from dataclasses import dataclass
from typing import Literal, cast
import cupy as cp
import cyclopts
import numpy as np
import pandas as pd
import shap.maskers
import torch
import xarray as xr
from rich import pretty, traceback
from shap import Explainer, Explanation, TreeExplainer
from sklearn import set_config
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
FunctionTransformer,
MaxAbsScaler,
MinMaxScaler,
Normalizer,
QuantileTransformer,
StandardScaler,
)
from stopuhr import stopwatch
from entropice.ml.dataset import DatasetEnsemble, SplittedArrays, TrainingSet
from entropice.ml.inference import predict_proba
from entropice.ml.models import (
ModelHPOConfig,
extract_espa_feature_importance,
extract_rf_feature_importance,
extract_xgboost_feature_importance,
get_model_hpo_config,
get_splitter,
)
from entropice.utils.metrics import get_metrics, metric_functions
from entropice.utils.paths import get_training_results_dir
from entropice.utils.training import HPOCV, Training, move_data_to_device
from entropice.utils.types import HPSearch, Model, Scaler, Splitter, TargetDataset, Task
traceback.install()
pretty.install()
cli = cyclopts.App(
"entropice-hpsearchcv",
config=cyclopts.config.Toml("hpsearchcv-config.toml", root_keys=["tool", "entropice-hpsearchcv"]), # ty:ignore[invalid-argument-type]
)
@cyclopts.Parameter("*")
@dataclass(frozen=True, kw_only=True)
class RunSettings:
"""Run settings for training."""
n_iter: int = 2000
task: Task = "binary"
target: TargetDataset = "darts_v1"
search: HPSearch = "random" # TODO: Implement grid and bayesian search
splitter: Splitter = "stratified_shuffle"
model: Model = "espa"
scaler: Scaler = "standard"
normalize: bool = False
@property
def device(self) -> Literal["torch", "cuda", "cpu"]:
"""Get the device to use for model training.
Note: Device Management currently is super nasty:
- CuML (RF & kNN) expects cupy ("cuda") arrays
- XGBoost always returns CPU arrays
- eSPA for some reason is super slow on cupy but very fast on torch tensors
- The sklearn permutation stuff does not work with torch tensors, only cupy and numpy arrays
- SHAP in general only works with CPU
"""
return "torch" if self.model == "espa" else "cuda"
def build_pipeline(self, model_hpo_config: ModelHPOConfig) -> Pipeline: # noqa: C901
"""Build a scikit-learn Pipeline based on the settings."""
# Add a feature scaler / normalization step if specified, but assert that it's only used for non-Tree models
if self.model in ["rf", "xgboost"]:
assert self.scaler == "none", f"Scaler {self.scaler} is not viable with model {self.model}"
elif self.scaler == "none":
assert self.scaler != "none", f"No scaler specified for model {self.model}, which is not viable."
match self.scaler:
case "standard":
scaler = StandardScaler()
case "minmax":
scaler = MinMaxScaler()
case "maxabs":
scaler = MaxAbsScaler()
case "quantile":
scaler = QuantileTransformer(output_distribution="normal")
case "none":
scaler = None
case _:
raise ValueError(f"Unknown scaler: {self.scaler}")
pipeline_steps = []
if scaler is not None:
print(f"Using {scaler.__class__.__name__} for feature scaling.")
pipeline_steps.append(("scaler", scaler))
if self.normalize:
print("Using Normalizer for feature normalization.")
pipeline_steps.append(("normalizer", Normalizer()))
# Necessary, because scaler and normalizer are only GPU ready in 1.8.0 but autogluon requries <1.8.0
if len(pipeline_steps) > 0 and self.device != "cpu":
print(f"Adding steps to move data to CPU for preprocessing and back to {self.device} for model fitting.")
to_cpu = FunctionTransformer(move_data_to_device, kw_args={"device": "cpu"})
pipeline_steps.insert(0, ("to_cpu", to_cpu))
to_gpu = FunctionTransformer(move_data_to_device, kw_args={"device": self.device})
pipeline_steps.append(("to_device", to_gpu))
pipeline_steps.append(("model", model_hpo_config.model))
return Pipeline(pipeline_steps)
def build_search(
self, pipeline: Pipeline, model_hpo_config: ModelHPOConfig, metrics: list[str], refit: str
) -> RandomizedSearchCV:
"""Build a scikit-learn RandomizedSearchCV based on the settings."""
if self.task in ["density", "count"]:
assert self.splitter not in ["stratified_shuffle", "stratified_kfold"], (
f"Splitter {self.splitter} is not viable for regression tasks"
)
cv = get_splitter(self.splitter, n_splits=5)
print(f"Using {cv.__class__.__name__} for cross-validation splitting.")
search_space = {f"model__{k}": v for k, v in model_hpo_config.search_space.items()}
if self.search == "random":
hp_search = RandomizedSearchCV(
estimator=pipeline,
param_distributions=search_space,
n_iter=self.n_iter,
cv=cv,
scoring=metrics,
refit=refit,
random_state=42,
verbose=2,
)
else:
raise NotImplementedError(f"Search method {self.search} not implemented yet.")
return hp_search
def _extract_cv_results(cv_results) -> pd.DataFrame:
cv_results = pd.DataFrame(cv_results)
# Parse the params into individual columns
params = pd.json_normalize(cv_results["params"]) # ty:ignore[invalid-argument-type]
cv_results = pd.concat([cv_results.drop(columns=["params"]), params], axis=1)
return cv_results
def _compute_metrics(y: SplittedArrays, y_pred: SplittedArrays, metrics: list[str]) -> pd.DataFrame:
    """Evaluate every metric on the train, test and combined splits.

    Returns a long-format DataFrame with "metric", "split" and "value" columns.
    """
    records = [
        {
            "metric": metric,
            "split": split,
            "value": metric_functions[metric](getattr(y, split), getattr(y_pred, split)),
        }
        for metric in metrics
        for split in ["train", "test", "combined"]
    ]
    return pd.DataFrame(records)
def _compute_confusion_matrices(
    y: SplittedArrays, y_pred: SplittedArrays, codes: np.ndarray, labels: list[str] | None
) -> xr.Dataset:
    """Build confusion matrices for the test, train and combined splits.

    Args:
        y: Ground-truth labels per split.
        y_pred: Predicted labels per split.
        codes: Label codes fixing the row/column order of the matrices.
        labels: Human-readable label names; falls back to the stringified codes.
    """
    if labels is None:
        labels = [str(code) for code in codes]
    matrices = {
        split: (
            ("true_label", "predicted_label"),
            confusion_matrix(getattr(y, split), getattr(y_pred, split), labels=codes),
        )
        for split in ["test", "train", "combined"]
    }
    return xr.Dataset(matrices, coords={"true_label": labels, "predicted_label": labels})
def _compute_feature_importance(model: Model, best_estimator: Pipeline, training_data: TrainingSet) -> pd.DataFrame:
    """Compute permutation importance plus model-specific feature importances.

    Permutation importance is computed on the test split. For models that
    expose their own importances (eSPA feature weights, XGBoost, RF), those
    are joined as additional columns.

    Args:
        model (Model): The model type identifier.
        best_estimator (Pipeline): The fitted pipeline containing the model.
        training_data (TrainingSet): The training set providing test arrays.

    Returns:
        pd.DataFrame: One row per feature with "importance" and "stddev"
            columns, plus model-specific columns when available.
    """
    # XGBoost and eSPA need CPU numpy arrays here; CuML models take device arrays.
    X_test = training_data.X.as_numpy().test if model in ["xgboost", "espa"] else training_data.X.test  # noqa: N806
    y_test = training_data.y.as_numpy().test if model in ["xgboost", "espa"] else training_data.y.test
    if model == "espa":
        # eSPA is super slow with cupy arrays, so we need to move the data to CPU for permutation importance
        # To do this, we manually recreate the pipeline but without the device steps
        if "scaler" in best_estimator.named_steps:
            X_test = best_estimator.named_steps["scaler"].transform(X_test)  # noqa: N806
        if "normalizer" in best_estimator.named_steps:
            X_test = best_estimator.named_steps["normalizer"].transform(X_test)  # noqa: N806
        best_estimator.named_steps["model"].to_numpy()  # inplace
    r = permutation_importance(
        # For eSPA, score the bare model on the pre-transformed CPU data (see above).
        best_estimator if model != "espa" else best_estimator.named_steps["model"],
        X_test,
        y_test,
        n_repeats=5,
        random_state=0,
        max_samples=min(5000, training_data.X.test.shape[0]),
    )
    if model == "espa":
        # Restore the eSPA model to torch tensors after the CPU round-trip.
        best_estimator.named_steps["model"].to_torch()  # inplace
    feature_importances = pd.DataFrame(
        {
            "importance": r["importances_mean"],
            "stddev": r["importances_std"],
        },
        index=training_data.feature_names,
    )
    # Join model-specific importances when the model type provides them.
    match model:
        case "espa":
            model_feature_importances = extract_espa_feature_importance(
                best_estimator.named_steps["model"], training_data
            ).rename(columns={"importance": "model_feature_weights"})
        case "xgboost":
            model_feature_importances = extract_xgboost_feature_importance(
                best_estimator.named_steps["model"], training_data
            )
        case "rf":
            model_feature_importances = extract_rf_feature_importance(
                best_estimator.named_steps["model"], training_data
            )
        case _:
            model_feature_importances = None
    if model_feature_importances is not None:
        feature_importances = feature_importances.join(model_feature_importances)
    return feature_importances
def _compute_shap_explanation(model: Model, best_estimator: Pipeline, training_data: TrainingSet) -> Explanation:
    """Compute a SHAP explanation for the fitted pipeline on a test-set sample.

    XGBoost uses the fast ``TreeExplainer``; the other models fall back to the
    generic ``Explainer`` driven by a CPU prediction wrapper, since SHAP cannot
    consume torch tensors or cupy arrays.

    Args:
        model (Model): The model type identifier.
        best_estimator (Pipeline): The fitted pipeline (scaler/normalizer + model).
        training_data (TrainingSet): The training set providing features and labels.

    Returns:
        Explanation: SHAP values for up to 200 sampled test rows.
    """
    match model:
        case "espa" | "knn" | "rf":  # CUML models do not yet work with TreeExplainer...
            # The masker background must be preprocessed exactly like the model
            # inputs, so the pipeline's scaler/normalizer are applied manually.
            train_transformed = training_data.X.as_numpy().train
            if "scaler" in best_estimator.named_steps:
                train_transformed = best_estimator.named_steps["scaler"].transform(train_transformed)
            if "normalizer" in best_estimator.named_steps:
                train_transformed = best_estimator.named_steps["normalizer"].transform(train_transformed)
            masker = shap.maskers.Independent(data=train_transformed)

            def _model_predict(data):
                # Call the bare model (not the full pipeline) and convert any
                # GPU output back to numpy so SHAP can handle it.
                mdl = best_estimator.named_steps["model"]
                f = mdl.predict_proba if hasattr(mdl, "predict_proba") else mdl.predict
                out = f(data)
                if isinstance(out, torch.Tensor):
                    out = out.cpu().numpy()
                elif isinstance(out, cp.ndarray):
                    out = out.get()
                return out

            explainer = Explainer(
                _model_predict,
                masker=masker,
                seed=42,
                feature_names=training_data.feature_names,
                output_names=training_data.target_labels,
            )
        case "xgboost":
            explainer = TreeExplainer(best_estimator.named_steps["model"], feature_names=training_data.feature_names)
        case _:
            raise ValueError(f"Unknown model: {model}")
    # Explain at most 200 test samples to keep SHAP runtime manageable,
    # applying the same preprocessing as for the background data.
    samples = training_data.X.as_numpy().test
    if len(samples) > 200:
        rng = np.random.default_rng(seed=42)
        sample_indices = rng.choice(len(samples), size=200, replace=False)
        samples = samples[sample_indices]
    if "scaler" in best_estimator.named_steps:
        samples = best_estimator.named_steps["scaler"].transform(samples)
    if "normalizer" in best_estimator.named_steps:
        samples = best_estimator.named_steps["normalizer"].transform(samples)
    explanation = explainer(samples)
    return explanation
@cli.default
def hpsearch_cv(
    dataset_ensemble: DatasetEnsemble,
    settings: RunSettings = RunSettings(),
    experiment: str | None = None,
) -> Training:
    """Perform random cross-validation on the training dataset.

    Runs a hyperparameter search over the configured pipeline, evaluates the
    refit best estimator on the train and test splits, computes feature
    importance and SHAP explanations, predicts over all grid cells, and saves
    everything as a Training summary.

    Args:
        dataset_ensemble (DatasetEnsemble): The dataset ensemble configuration.
        settings (RunSettings): This run's settings.
        experiment (str | None): Optional experiment name for results directory.

    Returns:
        Training: The saved training summary (metrics, model, explanations, predictions).
    """
    results_dir = get_training_results_dir(
        experiment=experiment,
        name="random_search",
        grid=dataset_ensemble.grid,
        level=dataset_ensemble.level,
        task=settings.task,
        target=settings.target,
        model_type=settings.model,
    )

    # Since we use cuml and xgboost libraries, we can only enable array API for ESPA
    use_array_api = settings.model != "xgboost"
    set_config(array_api_dispatch=use_array_api)

    print("Creating training data...")
    training_data = dataset_ensemble.create_training_set(
        task=settings.task, target=settings.target, device=settings.device
    )

    model_hpo_config = get_model_hpo_config(settings.model, settings.task)
    print(f"Using model: {settings.model} with parameters: {model_hpo_config.hp_config}")

    metrics, refit = get_metrics(settings.task)
    print(f"Using {len(metrics)} metrics as scoring and {refit} for refitting.")

    pipeline = settings.build_pipeline(model_hpo_config)
    print(f"Pipeline steps: {pipeline.named_steps}")

    hp_search = settings.build_search(pipeline, model_hpo_config, metrics, refit)

    print(f"Starting hyperparameter search with {settings.n_iter} iterations...")
    with stopwatch(f"RandomizedSearchCV fitting for {settings.n_iter} candidates"):
        # Fit params must be namespaced to reach the "model" step of the pipeline.
        fit_params = {f"model__{k}": v for k, v in model_hpo_config.fit_params.items()}
        hp_search.fit(
            training_data.X.train,
            # XGBoost returns its labels as numpy arrays instead of cupy arrays
            # Thus, for the scoring to work, we need to convert them back to numpy
            training_data.y.as_numpy().train if settings.model == "xgboost" else training_data.y.train,
            **fit_params,
        )

    print("Best parameters combination found:")
    best_estimator = cast(Pipeline, hp_search.best_estimator_)
    best_parameters = best_estimator.get_params()
    for param_name in sorted(model_hpo_config.hp_config.keys()):
        # Pipeline.get_params() prefixes step params with the step name ("model__...").
        search_param_name = f"model__{param_name}"
        print(f"{param_name}: {best_parameters[search_param_name]}")

    # Compute predictions on all sets and move them to numpy for metric computations
    y_pred = SplittedArrays(
        train=best_estimator.predict(training_data.X.train),
        test=best_estimator.predict(training_data.X.test),
    ).as_numpy()
    y = training_data.y.as_numpy()
    # NOTE: rebinds `metrics` from the list of scorer names to the computed metric values.
    metrics = _compute_metrics(y, y_pred, metrics)
    if settings.task in ["binary", "count_regimes", "density_regimes"]:
        # Confusion matrices are only meaningful for the classification tasks.
        confusion_matrix = _compute_confusion_matrices(
            y, y_pred, codes=np.array(training_data.target_codes), labels=training_data.target_labels
        )
    else:
        confusion_matrix = None
    print("Metrics computed!")

    with stopwatch("Computing feature importance with permutation importance..."):
        feature_importance = _compute_feature_importance(settings.model, best_estimator, training_data)

    with stopwatch("Explaining model predictions with SHAP..."):
        explanation = _compute_shap_explanation(settings.model, best_estimator, training_data)

    print("Predicting probabilities for all cells...")
    preds = predict_proba(dataset_ensemble, model=best_estimator, task=settings.task, device=settings.device)
    print(f"Predicted probabilities DataFrame with {len(preds)} entries.")

    summary = Training(
        path=results_dir,
        dataset=dataset_ensemble,
        method=HPOCV(
            method=settings.search,
            splitter=settings.splitter,
            scaler=settings.scaler,
            normalize=settings.normalize,
            n_iter=settings.n_iter,
            hpconfig=model_hpo_config.hp_config,
        ),
        task=settings.task,
        target=settings.target,
        training_set=training_data,
        model=best_estimator,
        model_type=settings.model,
        metrics=metrics,
        feature_importance=feature_importance,
        shap_explanation=explanation,
        predictions=preds,
        confusion_matrix=confusion_matrix,
        cv_results=_extract_cv_results(hp_search.cv_results_),
        leaderboard=None,
    )
    summary.save()
    return summary

View file

@ -5,13 +5,17 @@ from typing import Literal
import cupy as cp import cupy as cp
import geopandas as gpd import geopandas as gpd
import numpy as np
import pandas as pd import pandas as pd
import torch import torch
from autogluon.tabular import TabularPredictor
from rich import pretty, traceback from rich import pretty, traceback
from sklearn import set_config from sklearn import set_config
from sklearn.pipeline import Pipeline
from entropice.ml.dataset import DatasetEnsemble from entropice.ml.dataset import DatasetEnsemble
from entropice.ml.models import SupportedModel from entropice.ml.models import SupportedModel, is_classifier, is_regressor
from entropice.utils.types import Task
traceback.install() traceback.install()
pretty.install() pretty.install()
@ -19,16 +23,53 @@ pretty.install()
set_config(array_api_dispatch=True) set_config(array_api_dispatch=True)
def _assert_right_model_for_task(model: Pipeline | SupportedModel | TabularPredictor, task: Task) -> None:
assert hasattr(model, "predict") or hasattr(model, "predict_proba"), (
"Model must have a predict or predict_proba method"
)
is_classification = task in ["binary", "count_regimes", "density_regimes"]
is_regression = task in ["count", "density"]
assert is_classification != is_regression, "Task must be either classification or regression"
assert is_classifier(model) == is_classification, f"Model type does not match task type for task {task}"
assert is_regressor(model) == is_regression, f"Model type does not match task type for task {task}"
def _categorize_predictions(
preds: pd.Series | np.ndarray,
task: Task,
) -> pd.Series | pd.Categorical:
"""Convert the raw model predictions into a category type series."""
if isinstance(preds, np.ndarray):
preds = pd.Series(preds)
match task:
case "binary" | "count_regimes" | "density_regimes":
labels_dict = {
"binary": ["No RTS", "RTS"],
"count_regimes": ["None", "Very Few", "Few", "Several", "Many", "Very Many"],
"density_regimes": ["Empty", "Very Sparse", "Sparse", "Moderate", "Dense", "Very Dense"],
}
categories = pd.CategoricalDtype(categories=labels_dict[task], ordered=task != "binary")
# Check if preds are codes or labels
if preds.dtype == "object" or isinstance(preds.iloc[0], str):
return pd.Categorical(preds, dtype=categories)
else:
return pd.Categorical.from_codes(preds.astype(int).to_list(), dtype=categories)
case _:
return preds
def predict_proba( def predict_proba(
e: DatasetEnsemble, e: DatasetEnsemble,
model: SupportedModel, model: Pipeline | SupportedModel | TabularPredictor,
task: Task,
device: Literal["cpu", "cuda", "torch"] = "cuda", device: Literal["cpu", "cuda", "torch"] = "cuda",
) -> gpd.GeoDataFrame: ) -> gpd.GeoDataFrame:
"""Get predicted probabilities for each cell. """Get predicted probabilities for each cell.
Args: Args:
e (DatasetEnsemble): The dataset ensemble configuration. e (DatasetEnsemble): The dataset ensemble configuration.
model: SupportedModel: The trained model to use for predictions. model: SupportedModel | TabularPredictor: The trained model to use for predictions.
task (Task): The task.
device (Literal["cpu", "cuda", "torch"]): The device to use for predictions. device (Literal["cpu", "cuda", "torch"]): The device to use for predictions.
This must match with the state of the model! This must match with the state of the model!
@ -36,6 +77,7 @@ def predict_proba(
gpd.GeoDataFrame: A GeoDataFrame with cell_id, predicted probability, and geometry. gpd.GeoDataFrame: A GeoDataFrame with cell_id, predicted probability, and geometry.
""" """
_assert_right_model_for_task(model, task)
# Predict in batches to avoid memory issues # Predict in batches to avoid memory issues
batch_size = 50000 batch_size = 50000
preds = [] preds = []
@ -52,16 +94,27 @@ def predict_proba(
cell_ids = batch.index.to_numpy() cell_ids = batch.index.to_numpy()
cell_geoms = grid_gdf.loc[batch.index, "geometry"].to_numpy() cell_geoms = grid_gdf.loc[batch.index, "geometry"].to_numpy()
X_batch = batch.to_numpy(dtype="float64") if isinstance(model, TabularPredictor):
if device == "torch": print(f"Predicting batch of size {len(batch)} ({type(batch)}) with AutoGluon TabularPredictor...")
X_batch = torch.from_numpy(X_batch).to("cuda") batch_preds = model.predict(batch)
elif device == "cuda": print(f"Batch predictions type: {type(batch_preds)}, shape: {batch_preds.shape}")
X_batch = cp.asarray(X_batch)
batch_preds = model.predict(X_batch) assert isinstance(batch_preds, pd.DataFrame | pd.Series), (
if isinstance(batch_preds, cp.ndarray): "AutoGluon predict should return a DataFrame or Series"
batch_preds = batch_preds.get() )
elif torch.is_tensor(batch_preds): batch_preds = batch_preds.to_numpy()
batch_preds = batch_preds.cpu().numpy() else:
X_batch = batch.to_numpy(dtype="float64")
if device == "torch":
X_batch = torch.from_numpy(X_batch).to("cuda")
elif device == "cuda":
X_batch = cp.asarray(X_batch)
batch_preds = model.predict(X_batch)
if isinstance(batch_preds, cp.ndarray):
batch_preds = batch_preds.get()
elif torch.is_tensor(batch_preds):
batch_preds = batch_preds.cpu().numpy()
batch_preds = _categorize_predictions(batch_preds, task=task)
batch_preds = gpd.GeoDataFrame( batch_preds = gpd.GeoDataFrame(
{ {
"cell_id": cell_ids, "cell_id": cell_ids,

View file

@ -3,16 +3,30 @@
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import TypedDict from typing import TypedDict
import cupy as cp
import pandas as pd
import scipy.stats import scipy.stats
import torch
import xarray as xr import xarray as xr
from autogluon.tabular import TabularPredictor
from cuml.ensemble import RandomForestClassifier, RandomForestRegressor from cuml.ensemble import RandomForestClassifier, RandomForestRegressor
from cuml.neighbors import KNeighborsClassifier, KNeighborsRegressor from cuml.neighbors import KNeighborsClassifier, KNeighborsRegressor
from entropy import ESPAClassifier from entropy import ESPAClassifier
from scipy.stats._distn_infrastructure import rv_continuous_frozen, rv_discrete_frozen from scipy.stats._distn_infrastructure import rv_continuous_frozen, rv_discrete_frozen
from sklearn.model_selection import (
GroupKFold,
GroupShuffleSplit,
KFold,
ShuffleSplit,
StratifiedGroupKFold,
StratifiedKFold,
StratifiedShuffleSplit,
)
from sklearn.pipeline import Pipeline
from xgboost.sklearn import XGBClassifier, XGBRegressor from xgboost.sklearn import XGBClassifier, XGBRegressor
from entropice.ml.dataset import TrainingSet from entropice.ml.dataset import TrainingSet
from entropice.utils.types import Task from entropice.utils.types import Splitter, Task
class Distribution(TypedDict): class Distribution(TypedDict):
@ -35,6 +49,47 @@ type SupportedModel = (
) )
def get_search_space(hp_config: HPConfig) -> dict[str, list | rv_continuous_frozen | rv_discrete_frozen]:
"""Convert the HPConfig into a search space dictionary usable by sklearn's RandomizedSearchCV."""
search_space = {}
for key, dist in hp_config.items():
if isinstance(dist, list):
search_space[key] = dist
continue
assert hasattr(scipy.stats, dist["distribution"]), (
f"Unknown distribution type for {key}: {dist['distribution']}"
)
distfn = getattr(scipy.stats, dist["distribution"])
search_space[key] = distfn(dist["low"], dist["high"])
return search_space
def is_classifier(model: Pipeline | SupportedModel | TabularPredictor) -> bool:
"""Check if the model is a classifier."""
if isinstance(model, TabularPredictor):
return model.problem_type in ["binary", "multiclass"]
if isinstance(model, Pipeline):
# Check the last step of the pipeline for the model type
return is_classifier(model.steps[-1][1])
return isinstance(
model,
(ESPAClassifier, XGBClassifier, RandomForestClassifier, KNeighborsClassifier),
)
def is_regressor(model: Pipeline | SupportedModel | TabularPredictor) -> bool:
"""Check if the model is a regressor."""
if isinstance(model, TabularPredictor):
return model.problem_type in ["regression"]
if isinstance(model, Pipeline):
# Check the last step of the pipeline for the model type
return is_regressor(model.steps[-1][1])
return isinstance(
model,
(XGBRegressor, RandomForestRegressor, KNeighborsRegressor),
)
@dataclass(frozen=True) @dataclass(frozen=True)
class ModelHPOConfig: class ModelHPOConfig:
"""Model - Hyperparameter Optimization Configuration.""" """Model - Hyperparameter Optimization Configuration."""
@ -46,32 +101,43 @@ class ModelHPOConfig:
@property @property
def search_space(self) -> dict[str, list | rv_continuous_frozen | rv_discrete_frozen]: def search_space(self) -> dict[str, list | rv_continuous_frozen | rv_discrete_frozen]:
"""Convert the HPConfig into a search space dictionary usable by sklearn's RandomizedSearchCV.""" """Convert the HPConfig into a search space dictionary usable by sklearn's RandomizedSearchCV."""
search_space = {} return get_search_space(self.hp_config)
for key, dist in self.hp_config.items():
if isinstance(dist, list):
search_space[key] = dist
continue
assert hasattr(scipy.stats, dist["distribution"]), (
f"Unknown distribution type for {key}: {dist['distribution']}"
)
distfn = getattr(scipy.stats, dist["distribution"])
search_space[key] = distfn(dist["low"], dist["high"])
return search_space
# Hardcode Search Settings for now # Hardcode Search Settings for now
espa_hpconfig: HPConfig = { espa_hpconfig: HPConfig = {
"eps_cl": {"distribution": "loguniform", "low": 1e-11, "high": 1e-6}, "eps_cl": {"distribution": "loguniform", "low": 1e-11, "high": 1e-6},
"eps_e": {"distribution": "loguniform", "low": 1e4, "high": 1e8}, "eps_e": {"distribution": "loguniform", "low": 1e4, "high": 1e8},
"initial_K": {"distribution": "randint", "low": 400, "high": 800}, "initial_K": {"distribution": "randint", "low": 50, "high": 800},
} }
xgboost_hpconfig: HPConfig = { xgboost_hpconfig: HPConfig = {
"learning_rate": {"distribution": "loguniform", "low": 1e-3, "high": 1e-1}, # Learning & Regularization
"n_estimators": {"distribution": "randint", "low": 50, "high": 2000}, "learning_rate": {"distribution": "loguniform", "low": 0.01, "high": 0.3},
"n_estimators": {"distribution": "randint", "low": 100, "high": 500},
# Tree Structure (critical for overfitting control)
"max_depth": {"distribution": "randint", "low": 3, "high": 10},
# "min_child_weight": {"distribution": "randint", "low": 1, "high": 10},
# Feature Sampling (important for high-dimensional data)
"colsample_bytree": {"distribution": "uniform", "low": 0.3, "high": 1.0},
# "colsample_bylevel": {"distribution": "uniform", "low": 0.3, "high": 1.0},
# Row Sampling
# "subsample": {"distribution": "uniform", "low": 0.5, "high": 1.0},
# Regularization
# "reg_alpha": {"distribution": "loguniform", "low": 1e-8, "high": 1.0}, # L1
"reg_lambda": {"distribution": "loguniform", "low": 1e-8, "high": 10.0}, # L2
} }
rf_hpconfig: HPConfig = { rf_hpconfig: HPConfig = {
"max_depth": {"distribution": "randint", "low": 5, "high": 50}, # Tree Structure
"n_estimators": {"distribution": "randint", "low": 50, "high": 1000}, "max_depth": {"distribution": "randint", "low": 10, "high": 50},
"n_estimators": {"distribution": "randint", "low": 50, "high": 300},
# Split Criteria (critical for >100 features)
"max_features": {"distribution": "uniform", "low": 0.1, "high": 0.5}, # sqrt(100) ≈ 10% of features
"min_samples_split": {"distribution": "randint", "low": 2, "high": 20},
# "min_samples_leaf": {"distribution": "randint", "low": 1, "high": 10},
# Regularization (cuML specific)
# "min_impurity_decrease": {"distribution": "loguniform", "low": 1e-7, "high": 1e-2},
# Bootstrap
# "max_samples": {"distribution": "uniform", "low": 0.5, "high": 1.0},
} }
knn_hpconfig: HPConfig = { knn_hpconfig: HPConfig = {
"n_neighbors": {"distribution": "randint", "low": 10, "high": 200}, "n_neighbors": {"distribution": "randint", "low": 10, "high": 200},
@ -134,7 +200,7 @@ def get_model_hpo_config(model: str, task: Task, **model_kwargs) -> ModelHPOConf
raise ValueError(f"Unsupported model/task combination: {model}/{task}") raise ValueError(f"Unsupported model/task combination: {model}/{task}")
def extract_espa_feature_importance(model: ESPAClassifier, training_data: TrainingSet) -> xr.Dataset: def extract_espa_state(model: ESPAClassifier, training_data: TrainingSet) -> xr.Dataset:
"""Extract the inner state of a trained ESPAClassifier as an xarray Dataset.""" """Extract the inner state of a trained ESPAClassifier as an xarray Dataset."""
# Annotate the state with xarray metadata # Annotate the state with xarray metadata
boxes = list(range(model.K_)) boxes = list(range(model.K_))
@ -157,7 +223,7 @@ def extract_espa_feature_importance(model: ESPAClassifier, training_data: Traini
dims=["feature"], dims=["feature"],
coords={"feature": training_data.feature_names}, coords={"feature": training_data.feature_names},
name="feature_weights", name="feature_weights",
attrs={"description": "Feature weights for each box."}, attrs={"description": "Weights for each feature."},
) )
state = xr.Dataset( state = xr.Dataset(
{ {
@ -172,130 +238,68 @@ def extract_espa_feature_importance(model: ESPAClassifier, training_data: Traini
return state return state
def extract_xgboost_feature_importance(model: XGBClassifier | XGBRegressor, training_data: TrainingSet) -> xr.Dataset: def extract_espa_feature_importance(model: ESPAClassifier, training_data: TrainingSet) -> pd.DataFrame:
"""Extract feature importance from a trained XGBoost model as an xarray Dataset.""" """Extract feature importance from a trained ESPAClassifier as a pandas DataFrame."""
# Extract XGBoost-specific information weights = model.W_
# Get the underlying booster if isinstance(weights, torch.Tensor):
weights = weights.cpu().numpy()
elif isinstance(weights, cp.ndarray):
weights = weights.get()
return pd.DataFrame(
{"importance": weights},
index=training_data.feature_names,
)
def extract_xgboost_feature_importance(model: XGBClassifier | XGBRegressor, training_data: TrainingSet) -> pd.DataFrame:
"""Extract feature importance from a trained XGBoost model as a pandas DataFrame."""
booster = model.get_booster() booster = model.get_booster()
fi = {}
# Feature importance with different importance types for metric in ["weight", "gain", "cover", "total_gain", "total_cover"]:
# Note: get_score() returns dict with keys like 'f0', 'f1', etc. (feature indices) scores = booster.get_score(importance_type=metric)
importance_weight = booster.get_score(importance_type="weight") fi[metric] = [scores.get(f"f{i}", 0.0) for i in range(len(training_data.feature_names))]
importance_gain = booster.get_score(importance_type="gain") return pd.DataFrame(fi, index=training_data.feature_names)
importance_cover = booster.get_score(importance_type="cover")
importance_total_gain = booster.get_score(importance_type="total_gain")
importance_total_cover = booster.get_score(importance_type="total_cover")
# Create aligned arrays for all features (including zero-importance)
def align_importance(importance_dict, features):
"""Align importance dict to feature list, filling missing with 0.
XGBoost returns feature indices (f0, f1, ...) as keys, so we need to map them.
"""
return [importance_dict.get(f"f{i}", 0.0) for i in range(len(features))]
feature_importance_weight = xr.DataArray(
align_importance(importance_weight, training_data.feature_names),
dims=["feature"],
coords={"feature": training_data.feature_names},
name="feature_importance_weight",
attrs={"description": "Number of times a feature is used to split the data across all trees."},
)
feature_importance_gain = xr.DataArray(
align_importance(importance_gain, training_data.feature_names),
dims=["feature"],
coords={"feature": training_data.feature_names},
name="feature_importance_gain",
attrs={"description": "Average gain across all splits the feature is used in."},
)
feature_importance_cover = xr.DataArray(
align_importance(importance_cover, training_data.feature_names),
dims=["feature"],
coords={"feature": training_data.feature_names},
name="feature_importance_cover",
attrs={"description": "Average coverage across all splits the feature is used in."},
)
feature_importance_total_gain = xr.DataArray(
align_importance(importance_total_gain, training_data.feature_names),
dims=["feature"],
coords={"feature": training_data.feature_names},
name="feature_importance_total_gain",
attrs={"description": "Total gain across all splits the feature is used in."},
)
feature_importance_total_cover = xr.DataArray(
align_importance(importance_total_cover, training_data.feature_names),
dims=["feature"],
coords={"feature": training_data.feature_names},
name="feature_importance_total_cover",
attrs={"description": "Total coverage across all splits the feature is used in."},
)
# Store tree information
n_trees = booster.num_boosted_rounds()
state = xr.Dataset(
{
"feature_importance_weight": feature_importance_weight,
"feature_importance_gain": feature_importance_gain,
"feature_importance_cover": feature_importance_cover,
"feature_importance_total_gain": feature_importance_total_gain,
"feature_importance_total_cover": feature_importance_total_cover,
},
attrs={
"description": "Inner state of the best XGBClassifier from RandomizedSearchCV.",
"n_trees": n_trees,
"objective": str(model.objective),
},
)
return state
def extract_rf_feature_importance( def extract_rf_feature_importance(
model: RandomForestClassifier | RandomForestRegressor, training_data: TrainingSet model: RandomForestClassifier | RandomForestRegressor, training_data: TrainingSet
) -> xr.Dataset: ) -> pd.DataFrame:
"""Extract feature importance from a trained RandomForest model as an xarray Dataset.""" """Extract feature importance from a trained RandomForest model as a pandas DataFrame."""
# Extract Random Forest-specific information # Extract Random Forest-specific information
# Note: cuML's RandomForestClassifier doesn't expose individual trees (estimators_) # Note: cuML's RandomForestClassifier doesn't expose individual trees (estimators_)
# like sklearn does, so we can only extract feature importances and model parameters # like sklearn does, so we can only extract feature importances and model parameters
# Feature importances (Gini importance) # Feature importances (Gini importance)
feature_importances = model.feature_importances_ feature_importances = {"gini": model.feature_importances_}
return pd.DataFrame(feature_importances, index=training_data.feature_names)
feature_importance = xr.DataArray(
feature_importances,
dims=["feature"],
coords={"feature": training_data.feature_names},
name="feature_importance",
attrs={"description": "Gini importance (impurity-based feature importance)."},
)
# cuML RF doesn't expose individual trees, so we store model parameters instead def get_splitter(
n_estimators = model.n_estimators splitter: Splitter, n_splits: int
max_depth = model.max_depth ) -> (
KFold
# OOB score if available | StratifiedKFold
oob_score = None | ShuffleSplit
if hasattr(model, "oob_score_") and model.oob_score: | StratifiedShuffleSplit
oob_score = float(model.oob_score_) | GroupKFold
| StratifiedGroupKFold
# cuML RandomForest doesn't provide per-tree statistics like sklearn | GroupShuffleSplit
# Store what we have: feature importances and model configuration ):
attrs = { """Get a scikit-learn splitter object based on the specified splitter type."""
"description": "Inner state of the best RandomForestClassifier from RandomizedSearchCV (cuML).", match splitter:
"n_estimators": int(n_estimators), case "kfold":
"note": "cuML RandomForest does not expose individual tree statistics like sklearn", return KFold(n_splits=n_splits, shuffle=True, random_state=42)
} case "shuffle":
return ShuffleSplit(n_splits=n_splits, test_size=0.2, random_state=42)
# Only add optional attributes if they have values case "stratified":
if max_depth is not None: return StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
attrs["max_depth"] = int(max_depth) case "stratified_shuffle":
if oob_score is not None: return StratifiedShuffleSplit(n_splits=n_splits, test_size=0.2, random_state=42)
attrs["oob_score"] = oob_score case "group":
return GroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
state = xr.Dataset( case "stratified_group":
{ return StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
"feature_importance": feature_importance, case "shuffle_group":
}, return GroupShuffleSplit(n_splits=n_splits, test_size=0.2, random_state=42)
attrs=attrs, case _:
) raise ValueError(f"Unsupported splitter type: {splitter}")
return state

View file

@ -1,8 +1,7 @@
"""Training of classification models training.""" """DEPRECATED!!! Training of classification models training."""
import pickle import pickle
from dataclasses import asdict, dataclass from dataclasses import asdict, dataclass
from functools import partial
from pathlib import Path from pathlib import Path
import cyclopts import cyclopts
@ -13,15 +12,7 @@ import xarray as xr
from rich import pretty, traceback from rich import pretty, traceback
from sklearn import set_config from sklearn import set_config
from sklearn.metrics import ( from sklearn.metrics import (
accuracy_score,
confusion_matrix, confusion_matrix,
f1_score,
jaccard_score,
mean_absolute_error,
mean_squared_error,
precision_score,
r2_score,
recall_score,
) )
from sklearn.model_selection import KFold, RandomizedSearchCV from sklearn.model_selection import KFold, RandomizedSearchCV
from stopuhr import stopwatch from stopuhr import stopwatch
@ -30,11 +21,13 @@ from entropice.ml.dataset import DatasetEnsemble, SplittedArrays
from entropice.ml.inference import predict_proba from entropice.ml.inference import predict_proba
from entropice.ml.models import ( from entropice.ml.models import (
extract_espa_feature_importance, extract_espa_feature_importance,
extract_espa_state,
extract_rf_feature_importance, extract_rf_feature_importance,
extract_xgboost_feature_importance, extract_xgboost_feature_importance,
get_model_hpo_config, get_model_hpo_config,
) )
from entropice.utils.paths import get_cv_results_dir from entropice.utils.metrics import get_metrics, metric_functions
from entropice.utils.paths import get_training_results_dir
from entropice.utils.types import Model, TargetDataset, Task from entropice.utils.types import Model, TargetDataset, Task
traceback.install() traceback.install()
@ -44,56 +37,9 @@ pretty.install()
cli = cyclopts.App("entropice-training", config=cyclopts.config.Toml("training-config.toml")) # ty:ignore[invalid-argument-type] cli = cyclopts.App("entropice-training", config=cyclopts.config.Toml("training-config.toml")) # ty:ignore[invalid-argument-type]
def _get_metrics(task: Task) -> tuple[list[str], str]:
"""Get the list of metrics for a given task."""
if task == "binary":
return ["accuracy", "recall", "precision", "f1", "jaccard"], "f1"
elif task in ["count_regimes", "density_regimes"]:
return [
"accuracy", # equals "f1_micro", "precision_micro", "recall_micro", "recall_weighted"
"f1_macro",
"f1_weighted",
"precision_macro",
"precision_weighted",
"recall_macro",
"jaccard_micro",
"jaccard_macro",
"jaccard_weighted",
], "f1_weighted"
else:
return [
"neg_mean_squared_error",
"neg_mean_absolute_error",
"r2",
], "r2"
# Compute other metrics - using predictions directly instead of re-predicting for each metric
# Use functools.partial for cleaner metric definitions with non-default parameters
_metric_functions = {
"accuracy": accuracy_score,
"recall": recall_score,
"precision": precision_score,
"f1": f1_score,
"jaccard": jaccard_score,
"recall_macro": partial(recall_score, average="macro"),
"recall_weighted": partial(recall_score, average="weighted"),
"precision_macro": partial(precision_score, average="macro"),
"precision_weighted": partial(precision_score, average="weighted"),
"f1_macro": partial(f1_score, average="macro"),
"f1_weighted": partial(f1_score, average="weighted"),
"jaccard_micro": partial(jaccard_score, average="micro"),
"jaccard_macro": partial(jaccard_score, average="macro"),
"jaccard_weighted": partial(jaccard_score, average="weighted"),
"neg_mean_squared_error": mean_squared_error,
"neg_mean_absolute_error": mean_absolute_error,
"r2": r2_score,
}
@cyclopts.Parameter("*") @cyclopts.Parameter("*")
@dataclass(frozen=True, kw_only=True) @dataclass(frozen=True, kw_only=True)
class CVSettings: class RunSettings:
"""Cross-validation settings for model training.""" """Cross-validation settings for model training."""
n_iter: int = 2000 n_iter: int = 2000
@ -103,7 +49,7 @@ class CVSettings:
@dataclass(frozen=True, kw_only=True) @dataclass(frozen=True, kw_only=True)
class TrainingSettings(DatasetEnsemble, CVSettings): class TrainingSettings(DatasetEnsemble, RunSettings):
"""Helper Wrapper to store combined training and dataset ensemble settings.""" """Helper Wrapper to store combined training and dataset ensemble settings."""
param_grid: dict param_grid: dict
@ -115,14 +61,14 @@ class TrainingSettings(DatasetEnsemble, CVSettings):
@cli.default @cli.default
def random_cv( def random_cv(
dataset_ensemble: DatasetEnsemble, dataset_ensemble: DatasetEnsemble,
settings: CVSettings = CVSettings(), settings: RunSettings = RunSettings(),
experiment: str | None = None, experiment: str | None = None,
) -> Path: ) -> Path:
"""Perform random cross-validation on the training dataset. """Perform random cross-validation on the training dataset.
Args: Args:
dataset_ensemble (DatasetEnsemble): The dataset ensemble configuration. dataset_ensemble (DatasetEnsemble): The dataset ensemble configuration.
settings (CVSettings): The cross-validation settings. settings (RunSettings): The cross-validation settings.
experiment (str | None): Optional experiment name for results directory. experiment (str | None): Optional experiment name for results directory.
""" """
@ -136,7 +82,7 @@ def random_cv(
model_hpo_config = get_model_hpo_config(settings.model, settings.task) model_hpo_config = get_model_hpo_config(settings.model, settings.task)
print(f"Using model: {settings.model} with parameters: {model_hpo_config.hp_config}") print(f"Using model: {settings.model} with parameters: {model_hpo_config.hp_config}")
cv = KFold(n_splits=5, shuffle=True, random_state=42) cv = KFold(n_splits=5, shuffle=True, random_state=42)
metrics, refit = _get_metrics(settings.task) metrics, refit = get_metrics(settings.task)
search = RandomizedSearchCV( search = RandomizedSearchCV(
model_hpo_config.model, model_hpo_config.model,
model_hpo_config.search_space, model_hpo_config.search_space,
@ -175,12 +121,14 @@ def random_cv(
) )
print(f"{refit.replace('_', ' ').capitalize()} on test set: {test_score:.3f}") print(f"{refit.replace('_', ' ').capitalize()} on test set: {test_score:.3f}")
results_dir = get_cv_results_dir( results_dir = get_training_results_dir(
experiment=experiment, experiment=experiment,
name="random_search", name="random_search",
grid=dataset_ensemble.grid, grid=dataset_ensemble.grid,
level=dataset_ensemble.level, level=dataset_ensemble.level,
task=settings.task, task=settings.task,
target=settings.target,
model_type=settings.model,
) )
# Store the search settings # Store the search settings
@ -221,9 +169,9 @@ def random_cv(
# Compute and StoreMetrics # Compute and StoreMetrics
y = training_data.y.as_numpy() y = training_data.y.as_numpy()
test_metrics = {metric: _metric_functions[metric](y.test, y_pred.test) for metric in metrics} test_metrics = {metric: metric_functions[metric](y.test, y_pred.test) for metric in metrics}
train_metrics = {metric: _metric_functions[metric](y.train, y_pred.train) for metric in metrics} train_metrics = {metric: metric_functions[metric](y.train, y_pred.train) for metric in metrics}
combined_metrics = {metric: _metric_functions[metric](y.combined, y_pred.combined) for metric in metrics} combined_metrics = {metric: metric_functions[metric](y.combined, y_pred.combined) for metric in metrics}
all_metrics = { all_metrics = {
"test_metrics": test_metrics, "test_metrics": test_metrics,
"train_metrics": train_metrics, "train_metrics": train_metrics,
@ -255,31 +203,30 @@ def random_cv(
# Get the inner state of the best estimator # Get the inner state of the best estimator
if settings.model == "espa": if settings.model == "espa":
state = extract_espa_feature_importance(best_estimator, training_data) state = extract_espa_state(best_estimator, training_data)
state_file = results_dir / "best_estimator_state.nc" state_file = results_dir / "best_estimator_state.nc"
print(f"Storing best estimator state to {state_file}") print(f"Storing best estimator state to {state_file}")
state.to_netcdf(state_file, engine="h5netcdf") state.to_netcdf(state_file, engine="h5netcdf")
fi = extract_espa_feature_importance(best_estimator, training_data)
fi_file = results_dir / "best_estimator_feature_importance.parquet"
print(f"Storing best estimator feature importance to {fi_file}")
fi.to_parquet(fi_file)
elif settings.model == "xgboost": elif settings.model == "xgboost":
state = extract_xgboost_feature_importance(best_estimator, training_data) fi = extract_xgboost_feature_importance(best_estimator, training_data)
state_file = results_dir / "best_estimator_state.nc" fi_file = results_dir / "best_estimator_feature_importance.parquet"
print(f"Storing best estimator state to {state_file}") print(f"Storing best estimator feature importance to {fi_file}")
state.to_netcdf(state_file, engine="h5netcdf") fi.to_parquet(fi_file)
elif settings.model == "rf": elif settings.model == "rf":
state = extract_rf_feature_importance(best_estimator, training_data) fi = extract_rf_feature_importance(best_estimator, training_data)
state_file = results_dir / "best_estimator_state.nc" fi_file = results_dir / "best_estimator_feature_importance.parquet"
print(f"Storing best estimator state to {state_file}") print(f"Storing best estimator feature importance to {fi_file}")
state.to_netcdf(state_file, engine="h5netcdf") fi.to_parquet(fi_file)
# Predict probabilities for all cells # Predict probabilities for all cells
print("Predicting probabilities for all cells...") print("Predicting probabilities for all cells...")
preds = predict_proba(dataset_ensemble, model=best_estimator, device=device) preds = predict_proba(dataset_ensemble, model=best_estimator, task=settings.task, device=device)
if training_data.targets["y"].dtype == "category":
preds["predicted"] = preds["predicted"].astype("category")
preds["predicted"] = preds["predicted"].cat.set_categories(
training_data.targets["y"].cat.categories, ordered=True
)
print(f"Predicted probabilities DataFrame with {len(preds)} entries.") print(f"Predicted probabilities DataFrame with {len(preds)} entries.")
preds_file = results_dir / "predicted_probabilities.parquet" preds_file = results_dir / "predicted_probabilities.parquet"
print(f"Storing predicted probabilities to {preds_file}") print(f"Storing predicted probabilities to {preds_file}")

View file

@ -0,0 +1,63 @@
"""Metrics for model evaluation and hyperparameter optimization."""
from functools import partial
from sklearn.metrics import (
accuracy_score,
f1_score,
jaccard_score,
mean_absolute_error,
mean_squared_error,
precision_score,
r2_score,
recall_score,
)
from entropice.utils.types import Task
def get_metrics(task: Task) -> tuple[list[str], str]:
"""Get the list of metrics for a given task."""
if task == "binary":
return ["accuracy", "recall", "precision", "f1", "jaccard"], "f1"
elif task in ["count_regimes", "density_regimes"]:
return [
"accuracy", # equals "f1_micro", "precision_micro", "recall_micro", "recall_weighted"
"f1_macro",
"f1_weighted",
"precision_macro",
"precision_weighted",
"recall_macro",
"jaccard_micro",
"jaccard_macro",
"jaccard_weighted",
], "f1_weighted"
else:
return [
"neg_mean_squared_error",
"neg_mean_absolute_error",
"r2",
], "r2"
# Metric implementations keyed by the scorer names returned from get_metrics().
# Predictions are computed once and reused across metrics instead of
# re-predicting per metric; functools.partial fixes the non-default
# `average` parameter for the multi-class variants.
# NOTE(review): the "neg_*" keys map to the plain (positive) error functions,
# unlike sklearn's scorers of the same name which negate — confirm downstream
# consumers expect positive values.
_averaged_metrics = {
    f"{base}_{avg}": partial(score_fn, average=avg)
    for base, score_fn, averages in (
        ("recall", recall_score, ("macro", "weighted")),
        ("precision", precision_score, ("macro", "weighted")),
        ("f1", f1_score, ("macro", "weighted")),
        ("jaccard", jaccard_score, ("micro", "macro", "weighted")),
    )
    for avg in averages
}

metric_functions = {
    # Binary-classification metrics (sklearn defaults).
    "accuracy": accuracy_score,
    "recall": recall_score,
    "precision": precision_score,
    "f1": f1_score,
    "jaccard": jaccard_score,
    # Multi-class variants with explicit averaging.
    **_averaged_metrics,
    # Regression metrics.
    "neg_mean_squared_error": mean_squared_error,
    "neg_mean_absolute_error": mean_absolute_error,
    "r2": r2_score,
}

View file

@ -6,7 +6,7 @@ import os
from pathlib import Path from pathlib import Path
from typing import Literal from typing import Literal
from entropice.utils.types import Grid, Task, TemporalMode from entropice.utils.types import Grid, Model, TargetDataset, Task, TemporalMode
DATA_DIR = ( DATA_DIR = (
Path(os.environ.get("FAST_DATA_DIR", None) or os.environ.get("DATA_DIR", None) or "data").resolve() / "entropice" Path(os.environ.get("FAST_DATA_DIR", None) or os.environ.get("DATA_DIR", None) or "data").resolve() / "entropice"
@ -147,12 +147,14 @@ def get_dataset_stats_cache() -> Path:
return cache_dir / "dataset_stats.pckl" return cache_dir / "dataset_stats.pckl"
def get_cv_results_dir( def get_training_results_dir(
experiment: str | None, experiment: str | None,
name: str, name: str,
grid: Grid, grid: Grid,
level: int, level: int,
task: Task, task: Task,
target: TargetDataset,
model_type: Model | None = None,
) -> Path: ) -> Path:
gridname = _get_gridname(grid, level) gridname = _get_gridname(grid, level)
now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
@ -161,24 +163,9 @@ def get_cv_results_dir(
experiment_dir.mkdir(parents=True, exist_ok=True) experiment_dir.mkdir(parents=True, exist_ok=True)
else: else:
experiment_dir = RESULTS_DIR experiment_dir = RESULTS_DIR
results_dir = experiment_dir / f"{gridname}_{name}_cv{now}_{task}" parts = [gridname, name, now, task, target]
results_dir.mkdir(parents=True, exist_ok=True) if model_type is not None:
return results_dir parts.append(model_type)
results_dir = experiment_dir / "_".join(parts)
def get_autogluon_results_dir(
experiment: str | None,
grid: Grid,
level: int,
task: Task,
) -> Path:
gridname = _get_gridname(grid, level)
now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
if experiment is not None:
experiment_dir = RESULTS_DIR / experiment
experiment_dir.mkdir(parents=True, exist_ok=True)
else:
experiment_dir = RESULTS_DIR
results_dir = experiment_dir / f"{gridname}_autogluon_{now}_{task}"
results_dir.mkdir(parents=True, exist_ok=True) results_dir.mkdir(parents=True, exist_ok=True)
return results_dir return results_dir

View file

@ -0,0 +1,238 @@
import pickle
from dataclasses import asdict, dataclass
from functools import cached_property
from pathlib import Path
from typing import Any, Literal
import cupy as cp
import geopandas as gpd
import numpy as np
import pandas as pd
import toml
import torch
import xarray as xr
from shap import Explanation
from entropice.ml.dataset import DatasetEnsemble, TrainingSet
from entropice.ml.models import (
HPConfig,
extract_espa_state,
get_search_space,
)
from entropice.utils.types import HPSearch, Model, Scaler, Splitter, TargetDataset, Task
type ndarray = np.ndarray | torch.Tensor | cp.ndarray
def move_data_to_device(data: ndarray, device: Literal["torch", "cuda", "cpu"]) -> ndarray:
"""Move the given data to the specified device (CPU, CUDA, or PyTorch tensor)."""
match device:
case "torch":
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
return torch.asarray(data, device=torch_device)
case "cuda":
if isinstance(data, cp.ndarray):
return data
with cp.cuda.Device(0):
return cp.asarray(data)
case "cpu":
if isinstance(data, np.ndarray):
return data
elif isinstance(data, torch.Tensor):
return data.cpu().numpy()
elif isinstance(data, cp.ndarray):
return data.get()
case _:
raise ValueError(f"Unknown device: {device}")
@dataclass
class HPOCV:
    """Settings of a hyperparameter optimization run using cross-validated search."""

    # Search strategy, e.g. "random" (see entropice.utils.types.HPSearch).
    method: HPSearch
    # Cross-validation splitting strategy, e.g. "kfold" or "stratified".
    splitter: Splitter
    # Feature scaler applied before fitting ("standard", "minmax", ..., "none").
    scaler: Scaler
    # Whether to normalize features — presumably in addition to scaling; confirm against pipeline construction.
    normalize: bool
    # Number of parameter settings sampled by the search.
    n_iter: int
    # Model-specific hyperparameter configuration the search space is derived from.
    hpconfig: HPConfig

    @property
    def search_space(self):
        """Hyperparameter search space derived from ``hpconfig`` via ``get_search_space``."""
        return get_search_space(self.hpconfig)
@dataclass
class AutoML:
    """Settings of an AutoML (e.g. AutoGluon) training run."""

    # Time budget for training — presumably seconds; confirm unit against the AutoML backend.
    time_budget: int
    # Quality preset passed to the AutoML framework (e.g. "medium").
    preset: str
    # Whether to additionally run hyperparameter optimization within the AutoML run.
    hpo: bool
@dataclass
class Training:
    """Configuration and results of a training run.

    A training run involves the complete approach of training a machine learning model, currently:

    1. A hyperparameter optimization using cross-validation (``HPOCV``)
    2. Use of AutoML techniques (``AutoML``)

    Thus, a training run is always defined as:

        f(dataset, method) -> (model, metrics)

    Metrics refer to a simple dataframe in long format, with the columns: "metric", "split", "value".
    Split is either "train", "test" or "complete".

    Instances round-trip to disk via :meth:`save` / :meth:`load`; each artifact
    (config, model, metrics, explanations, predictions, optional extras) is
    stored as a separate file under ``path``.
    """

    # Directory all artifacts of this run are written to / read from.
    path: Path
    # Input data ensemble the model was trained on.
    dataset: DatasetEnsemble
    # How the model was obtained: HPO cross-validation or AutoML.
    method: HPOCV | AutoML
    # Prediction task, e.g. "binary" or "density".
    task: Task
    # Target dataset identifier, e.g. "darts_v1".
    target: TargetDataset
    training_set: TrainingSet  # TODO: Store Training Set to improve loading time (?)
    # Trained estimator object; type depends on model_type, hence Any.
    model: Any
    # Model family identifier, e.g. "espa", "xgboost", "rf".
    model_type: Model
    # Long-format evaluation metrics: columns "metric", "split", "value".
    metrics: pd.DataFrame
    # Per-feature importance values of the trained model.
    feature_importance: pd.DataFrame
    # SHAP explanation object for the trained model.
    shap_explanation: Explanation
    # Per-cell predictions with geometry.
    predictions: gpd.GeoDataFrame
    confusion_matrix: xr.Dataset | None  # only for classification tasks
    cv_results: pd.DataFrame | None  # only for HPOCV
    leaderboard: pd.DataFrame | None  # only for AutoGluon

    def __repr__(self) -> str:
        """Return a compact one-line summary (path, dataset, method, task, target)."""
        return (
            f"Training("
            f"path={self.path}, "
            f"dataset={self.dataset.grid}-{self.dataset.level}{self.dataset.members}, "
            f"method={type(self.method).__name__}, "
            f"task={self.task}, "
            f"target={self.target})"
        )

    @cached_property
    def metric_names(self) -> list[str]:
        """Get the list of metric names from the metrics DataFrame."""
        return self.metrics["metric"].unique().tolist()

    @property
    def get_state(self) -> xr.Dataset | pd.DataFrame | None:
        """Get the inner state of the trained model, if available.

        Only "espa" models expose an inner state; all other model types yield None.
        """
        if self.model_type == "espa":
            return extract_espa_state(self.model, self.training_set)
        else:
            return None

    def save(self) -> None:
        """Save the training results to the specified path.

        Writes the run config as TOML, the model and SHAP explanation as
        pickles, the tabular artifacts as parquet, and the confusion matrix
        (if any) as netCDF. Optional artifacts (confusion matrix, CV results,
        leaderboard) are only written when present.
        """
        self.path.mkdir(parents=True, exist_ok=True)
        config_file = self.path / "training_config.toml"
        model_file = self.path / "model.pkl"
        metrics_file = self.path / "metrics.parquet"
        feature_importance_file = self.path / "feature_importance.parquet"
        explanations_file = self.path / "shap_explanation.pkl"
        predictions_file = self.path / "predictions.parquet"

        # Save config — dataset and method are dataclasses, serialized via asdict;
        # method_type records which dataclass to reconstruct on load.
        with open(config_file, "w") as f:
            toml.dump(
                {
                    "dataset": asdict(self.dataset),
                    "method": asdict(self.method),
                    "method_type": type(self.method).__name__,
                    "task": self.task,
                    "target": self.target,
                    "model_type": self.model_type,
                },
                f,
            )

        # Save model, metrics, explanations and predictions
        model_file.write_bytes(pickle.dumps(self.model))
        self.metrics.to_parquet(metrics_file)
        self.feature_importance.to_parquet(feature_importance_file)
        explanations_file.write_bytes(pickle.dumps(self.shap_explanation))
        self.predictions.to_parquet(predictions_file)

        # Save the confusion matrix if it exists
        if self.confusion_matrix is not None:
            cm_file = self.path / "confusion_matrix.nc"
            self.confusion_matrix.to_netcdf(cm_file, engine="h5netcdf")

        # Save the CV search results if they exist (HPOCV runs only)
        if self.cv_results is not None:
            results_file = self.path / "search_results.parquet"
            self.cv_results.to_parquet(results_file)

        # Save the leaderboard if it exists
        if self.leaderboard is not None:
            leaderboard_file = self.path / "leaderboard.parquet"
            self.leaderboard.to_parquet(leaderboard_file)

    @classmethod
    def load(cls, path: Path, device: Literal["cpu", "cuda"] = "cpu") -> "Training":
        """Load a training run from the specified path.

        Args:
            path: Directory a previous :meth:`save` wrote its artifacts to.
            device: Device on which the training set is re-created.

        Returns:
            The reconstructed ``Training`` instance. The training set is not
            stored on disk; it is recomputed from the dataset ensemble here.

        NOTE: the model and SHAP explanation are unpickled — only load
        artifacts from trusted sources.
        """
        config_file = path / "training_config.toml"
        model_file = path / "model.pkl"
        metrics_file = path / "metrics.parquet"
        feature_importance_file = path / "feature_importance.parquet"
        predictions_file = path / "predictions.parquet"
        cm_file = path / "confusion_matrix.nc"
        cv_results_file = path / "search_results.parquet"
        leaderboard_file = path / "leaderboard.parquet"

        # Load config
        with open(config_file) as f:
            config = toml.load(f)
        task = config["task"]
        target = config["target"]
        model_type = config["model_type"]
        dataset = DatasetEnsemble(**config["dataset"])
        # Recompute the training set (not persisted; see TODO on the field).
        training_set = dataset.create_training_set(task, target, device)
        method_type = config["method_type"]
        if method_type == "HPOCV":
            method = HPOCV(**config["method"])
        elif method_type == "AutoML":
            method = AutoML(**config["method"])
        else:
            raise ValueError(f"Unknown method type: {method_type}")

        # Load model, metrics, explanations and predictions
        model = pickle.loads(model_file.read_bytes())
        metrics = pd.read_parquet(metrics_file)
        feature_importance = pd.read_parquet(feature_importance_file)
        shap_explanation = pickle.loads((path / "shap_explanation.pkl").read_bytes())
        predictions = gpd.read_parquet(predictions_file)

        # Load confusion matrix if it exists
        confusion_matrix = None
        if cm_file.exists():
            confusion_matrix = xr.load_dataset(cm_file, engine="h5netcdf")

        # Load CV search results if they exist
        cv_results = None
        if cv_results_file.exists():
            cv_results = pd.read_parquet(cv_results_file)

        # Load leaderboard if it exists
        leaderboard = None
        if leaderboard_file.exists():
            leaderboard = pd.read_parquet(leaderboard_file)

        return cls(
            path=path,
            dataset=dataset,
            method=method,
            task=task,
            target=target,
            training_set=training_set,
            model=model,
            model_type=model_type,
            metrics=metrics,
            feature_importance=feature_importance,
            shap_explanation=shap_explanation,
            predictions=predictions,
            confusion_matrix=confusion_matrix,
            cv_results=cv_results,
            leaderboard=leaderboard,
        )

View file

@ -19,10 +19,15 @@ type TargetDataset = Literal["darts_v1", "darts_mllabels"]
type L0SourceDataset = Literal["ArcticDEM", "ERA5", "AlphaEarth"] type L0SourceDataset = Literal["ArcticDEM", "ERA5", "AlphaEarth"]
type L2SourceDataset = Literal["ArcticDEM", "ERA5-shoulder", "ERA5-seasonal", "ERA5-yearly", "AlphaEarth"] type L2SourceDataset = Literal["ArcticDEM", "ERA5-shoulder", "ERA5-seasonal", "ERA5-yearly", "AlphaEarth"]
type Task = Literal["binary", "count_regimes", "density_regimes", "count", "density"] type Task = Literal["binary", "count_regimes", "density_regimes", "count", "density"]
# TODO: Consider implementing a "timeseries" temporal mode # TODO: Consider implementing a "timeseries" and "event" temporal mode
type TemporalMode = Literal["feature", "synopsis", 2018, 2019, 2020, 2021, 2022, 2023] type TemporalMode = Literal["feature", "synopsis", 2018, 2019, 2020, 2021, 2022, 2023]
type Model = Literal["espa", "xgboost", "rf", "knn"] type Model = Literal["espa", "xgboost", "rf", "knn", "autogluon"]
type Stage = Literal["train", "inference", "visualization"] type Stage = Literal["train", "inference", "visualization"]
type HPSearch = Literal["random"] # TODO Grid and Bayesian search
type Splitter = Literal[
"kfold", "stratified", "shuffle", "stratified_shuffle", "group", "stratified_group", "shuffle_group"
]
type Scaler = Literal["standard", "minmax", "maxabs", "quantile", "none"]
@dataclass(frozen=True) @dataclass(frozen=True)

View file

@ -29,7 +29,7 @@ import shutil
import pytest import pytest
from entropice.ml.dataset import DatasetEnsemble from entropice.ml.dataset import DatasetEnsemble
from entropice.ml.training import CVSettings, random_cv from entropice.ml.randomsearch import RunSettings, random_cv
from entropice.utils.types import Model, Task from entropice.utils.types import Model, Task
@ -90,7 +90,7 @@ class TestRandomCV:
- All output files are created - All output files are created
""" """
# Use darts_v1 as the primary target for all tests # Use darts_v1 as the primary target for all tests
settings = CVSettings( settings = RunSettings(
n_iter=3, n_iter=3,
task=task, task=task,
target="darts_v1", target="darts_v1",
@ -141,7 +141,7 @@ class TestRandomCV:
- xgboost: Uses CUDA without array API dispatch - xgboost: Uses CUDA without array API dispatch
- rf/knn: GPU-accelerated via cuML - rf/knn: GPU-accelerated via cuML
""" """
settings = CVSettings( settings = RunSettings(
n_iter=3, n_iter=3,
task="binary", # Simple binary task for device testing task="binary", # Simple binary task for device testing
target="darts_v1", target="darts_v1",
@ -168,7 +168,7 @@ class TestRandomCV:
def test_random_cv_with_mllabels(self, test_ensemble, cleanup_results): def test_random_cv_with_mllabels(self, test_ensemble, cleanup_results):
"""Test random_cv with multi-label target dataset.""" """Test random_cv with multi-label target dataset."""
settings = CVSettings( settings = RunSettings(
n_iter=3, n_iter=3,
task="binary", task="binary",
target="darts_mllabels", target="darts_mllabels",
@ -199,7 +199,7 @@ class TestRandomCV:
add_lonlat=True, add_lonlat=True,
) )
settings = CVSettings( settings = RunSettings(
n_iter=3, n_iter=3,
task="binary", task="binary",
target="darts_v1", target="darts_v1",