Redo Training Results Analysis

This commit is contained in:
Tobias Hölzer 2026-01-19 16:35:38 +01:00
parent 2664579a75
commit 7d874f7f92
16 changed files with 1455 additions and 2227 deletions

View file

@ -12,7 +12,6 @@ Pages:
import streamlit as st import streamlit as st
from entropice.dashboard.views.autogluon_analysis_page import render_autogluon_analysis_page
from entropice.dashboard.views.dataset_page import render_dataset_page from entropice.dashboard.views.dataset_page import render_dataset_page
from entropice.dashboard.views.inference_page import render_inference_page from entropice.dashboard.views.inference_page import render_inference_page
from entropice.dashboard.views.model_state_page import render_model_state_page from entropice.dashboard.views.model_state_page import render_model_state_page
@ -28,7 +27,6 @@ def main():
overview_page = st.Page(render_overview_page, title="Overview", icon="🏡", default=True) overview_page = st.Page(render_overview_page, title="Overview", icon="🏡", default=True)
data_page = st.Page(render_dataset_page, title="Dataset", icon="📊") data_page = st.Page(render_dataset_page, title="Dataset", icon="📊")
training_analysis_page = st.Page(render_training_analysis_page, title="Training Results Analysis", icon="🦾") training_analysis_page = st.Page(render_training_analysis_page, title="Training Results Analysis", icon="🦾")
autogluon_page = st.Page(render_autogluon_analysis_page, title="AutoGluon Analysis", icon="🤖")
model_state_page = st.Page(render_model_state_page, title="Model State", icon="🧮") model_state_page = st.Page(render_model_state_page, title="Model State", icon="🧮")
inference_page = st.Page(render_inference_page, title="Inference", icon="🗺️") inference_page = st.Page(render_inference_page, title="Inference", icon="🗺️")
@ -36,7 +34,7 @@ def main():
{ {
"Overview": [overview_page], "Overview": [overview_page],
"Data": [data_page], "Data": [data_page],
"Experiments": [training_analysis_page, autogluon_page, model_state_page], "Experiments": [training_analysis_page, model_state_page],
"Inference": [inference_page], "Inference": [inference_page],
} }
) )

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,417 @@
"""Hyperparameter space plotting functions."""
import matplotlib.colors as mcolors
import pandas as pd
import plotly.graph_objects as go
from entropice.dashboard.utils.colors import get_cmap, get_palette
def plot_performance_summary(results: pd.DataFrame, refit_metric: str) -> tuple[pd.DataFrame, pd.DataFrame, dict]:
    """Compute performance summary statistics.

    Args:
        results: DataFrame with CV results (sklearn ``cv_results_`` style).
        refit_metric: The metric used for refit (e.g., 'f1', 'f1_weighted').

    Returns:
        Tuple of (best_scores_df, score_stats_df, best_params_dict).
    """
    score_cols = [c for c in results.columns if c.startswith("mean_test_")]
    if not score_cols:
        # Nothing to summarize without test-score columns.
        return pd.DataFrame(), pd.DataFrame(), {}

    def _pretty(col: str) -> str:
        # "mean_test_f1_weighted" -> "F1 Weighted"
        return col.replace("mean_test_", "").replace("_", " ").title()

    # Best score per metric, formatted for display.
    best_scores = pd.DataFrame(
        [{"Metric": _pretty(c), "Best Score": f"{results[c].max():.4f}"} for c in score_cols]
    )

    # Mean ± standard deviation per metric across all trials.
    score_stats = pd.DataFrame(
        [
            {"Metric": _pretty(c), "Mean ± Std": f"{results[c].mean():.4f} ± {results[c].std():.4f}"}
            for c in score_cols
        ]
    )

    # Best parameter combination according to the refit metric
    # (fall back to the first score column if the refit column is missing).
    refit_col = f"mean_test_{refit_metric}"
    if refit_col not in results.columns:
        refit_col = score_cols[0]
    best_row = results.loc[results[refit_col].idxmax()]
    param_cols = [c for c in results.columns if c.startswith("param_") and c != "params"]
    best_params = {c.replace("param_", ""): best_row[c] for c in param_cols}

    return best_scores, score_stats, best_params
def plot_parameter_distributions(results: pd.DataFrame, param_grid: dict | None = None) -> dict[str, go.Figure]:
    """Create histogram charts for parameter distributions.

    Args:
        results: DataFrame with CV results.
        param_grid: Optional parameter grid with distribution information
            (currently unused; kept for API symmetry with the other plotters).

    Returns:
        Dictionary mapping parameter names to Plotly figures.
    """
    # Get parameter columns
    param_cols = [col for col in results.columns if col.startswith("param_") and col != "params"]
    if not param_cols:
        return {}

    cmap = get_cmap("parameter_distribution")
    bar_color = mcolors.rgb2hex(cmap(0.5))

    charts: dict[str, go.Figure] = {}
    for param_col in param_cols:
        param_name = param_col.replace("param_", "")
        param_values = results[param_col].dropna()
        if len(param_values) == 0:
            continue

        fig = go.Figure()
        if pd.api.types.is_numeric_dtype(param_values):
            # Numeric parameters: histogram of sampled values.
            fig.add_trace(
                go.Histogram(
                    x=param_values,
                    nbinsx=30,
                    marker_color=bar_color,
                    name=param_name,
                )
            )
        else:
            # Categorical parameters: bar chart of value counts.
            value_counts = param_values.value_counts().reset_index()
            value_counts.columns = [param_name, "count"]
            fig.add_trace(
                go.Bar(
                    x=value_counts[param_name],
                    y=value_counts["count"],
                    marker_color=bar_color,
                    name=param_name,
                )
            )
        # The layout is identical for both chart types, so configure it once
        # (previously duplicated in both branches).
        fig.update_layout(
            title=f"Distribution of {param_name}",
            xaxis_title=param_name,
            yaxis_title="Count",
            height=400,
            showlegend=False,
        )
        charts[param_name] = fig
    return charts
def plot_score_vs_parameters(
    results: pd.DataFrame, metric: str, param_grid: dict | None = None
) -> dict[str, go.Figure]:
    """Create scatter plots of score vs each parameter.

    Args:
        results: DataFrame with CV results.
        metric: The metric to plot (e.g., 'f1', 'accuracy').
        param_grid: Optional parameter grid with distribution information.

    Returns:
        Dictionary mapping parameter names to Plotly figures.
    """
    score_col = f"mean_test_{metric}"
    if score_col not in results.columns:
        return {}
    param_cols = [c for c in results.columns if c.startswith("param_") and c != "params"]
    if not param_cols:
        return {}

    # Shared styling: colorscale from the project palette and a display label.
    hex_colors = get_palette(metric, n_colors=256)
    colorscale = [[idx / 255, color] for idx, color in enumerate(hex_colors)]
    metric_label = metric.replace("_", " ").title()

    def _is_loguniform(name: str) -> bool:
        # A log x-axis is used when the search space declared a loguniform prior.
        config = (param_grid or {}).get(name)
        return isinstance(config, dict) and config.get("distribution") == "loguniform"

    charts: dict[str, go.Figure] = {}
    for param_col in param_cols:
        param_name = param_col.replace("param_", "")
        if results[param_col].dropna().empty:
            continue

        hover_text = [
            f"{param_name}: {val}<br>Score: {score:.4f}"
            for val, score in zip(results[param_col], results[score_col])
        ]
        fig = go.Figure(
            go.Scatter(
                x=results[param_col],
                y=results[score_col],
                mode="markers",
                marker={
                    "size": 8,
                    "color": results[score_col],
                    "colorscale": colorscale,
                    "showscale": False,
                    "opacity": 0.6,
                },
                text=hover_text,
                hovertemplate="%{text}<extra></extra>",
            )
        )
        fig.update_layout(
            title=f"{metric_label} vs {param_name}",
            xaxis_title=param_name,
            xaxis_type="log" if _is_loguniform(param_name) else "linear",
            yaxis_title=metric_label,
            height=400,
            showlegend=False,
        )
        charts[param_name] = fig
    return charts
def plot_parameter_correlations(results: pd.DataFrame, metric: str) -> go.Figure | None:
    """Create correlation bar chart between parameters and score.

    Args:
        results: DataFrame with CV results.
        metric: The metric to analyze (e.g., 'f1', 'accuracy').

    Returns:
        Plotly figure or None if no numeric parameters found.
    """
    score_col = f"mean_test_{metric}"
    if score_col not in results.columns:
        return None

    # Correlation is only meaningful for numeric parameters.
    param_cols = [c for c in results.columns if c.startswith("param_") and c != "params"]
    numeric_params = [c for c in param_cols if pd.api.types.is_numeric_dtype(results[c])]
    if not numeric_params:
        return None

    # Pearson correlation of each numeric parameter with the score.
    correlations = [
        {
            "Parameter": col.replace("param_", ""),
            "Correlation": results[[col, score_col]].corr().iloc[0, 1],
        }
        for col in numeric_params
    ]
    corr_df = pd.DataFrame(correlations).sort_values("Correlation", ascending=False)

    # Diverging colormap anchored at [-1, 1] so sign is visually meaningful.
    hex_colors = get_palette("correlation", n_colors=256)
    bar = go.Bar(
        x=corr_df["Correlation"],
        y=corr_df["Parameter"],
        orientation="h",
        marker={
            "color": corr_df["Correlation"],
            "colorscale": [[idx / 255, color] for idx, color in enumerate(hex_colors)],
            "cmin": -1,
            "cmax": 1,
            "showscale": False,
        },
        text=[f"{c:.3f}" for c in corr_df["Correlation"]],
        hovertemplate="%{y}<br>Correlation: %{x:.3f}<extra></extra>",
    )
    fig = go.Figure(bar)
    fig.update_layout(
        xaxis_title="Correlation with Score",
        yaxis_title="Parameter",
        # Grow the chart with the number of parameters, but keep a sane minimum.
        height=max(300, len(correlations) * 30),
        showlegend=False,
    )
    return fig
def plot_parameter_interactions(results: pd.DataFrame, metric: str, param_grid: dict | None = None) -> list[go.Figure]:
    """Create scatter plots showing parameter interactions.

    Args:
        results: DataFrame with CV results.
        metric: The metric to visualize (e.g., 'f1', 'accuracy').
        param_grid: Optional parameter grid with distribution information.

    Returns:
        List of Plotly figures showing parameter interactions.
    """
    score_col = f"mean_test_{metric}"
    if score_col not in results.columns:
        return []

    # Pairwise plots only make sense with at least two numeric parameters.
    param_cols = [c for c in results.columns if c.startswith("param_") and c != "params"]
    numeric_params = [c for c in param_cols if pd.api.types.is_numeric_dtype(results[c])]
    if len(numeric_params) < 2:
        return []

    hex_colors = get_palette(metric, n_colors=256)
    colorscale = [[idx / 255, color] for idx, color in enumerate(hex_colors)]
    metric_label = metric.replace("_", " ").title()

    def _axis_type(name: str) -> str:
        # Use a log axis when the search space declared a loguniform prior.
        config = (param_grid or {}).get(name)
        if isinstance(config, dict) and config.get("distribution") == "loguniform":
            return "log"
        return "linear"

    param_names = [c.replace("param_", "") for c in numeric_params]
    charts: list[go.Figure] = []
    # One figure per unordered parameter pair, colored by score.
    for first in range(len(param_names) - 1):
        for second in range(first + 1, len(param_names)):
            x_param, y_param = param_names[first], param_names[second]
            x_col, y_col = f"param_{x_param}", f"param_{y_param}"

            hover_text = [
                f"{x_param}: {x_val}<br>{y_param}: {y_val}<br>Score: {score:.4f}"
                for x_val, y_val, score in zip(results[x_col], results[y_col], results[score_col])
            ]
            fig = go.Figure(
                go.Scatter(
                    x=results[x_col],
                    y=results[y_col],
                    mode="markers",
                    marker={
                        "size": 8,
                        "color": results[score_col],
                        "colorscale": colorscale,
                        "showscale": True,
                        "colorbar": {"title": metric_label},
                        "opacity": 0.7,
                    },
                    text=hover_text,
                    hovertemplate="%{text}<extra></extra>",
                )
            )
            fig.update_layout(
                title=f"{metric_label} by {x_param} and {y_param}",
                xaxis_title=x_param,
                xaxis_type=_axis_type(x_param),
                yaxis_title=y_param,
                yaxis_type=_axis_type(y_param),
                height=500,
                width=500,
            )
            charts.append(fig)
    return charts
def plot_score_evolution(results: pd.DataFrame, metric: str) -> go.Figure | None:
    """Create line chart showing score evolution over iterations.

    Args:
        results: DataFrame with CV results.
        metric: The metric to visualize (e.g., 'f1', 'accuracy').

    Returns:
        Plotly figure or None if metric not found.
    """
    score_col = f"mean_test_{metric}"
    if score_col not in results.columns:
        return None

    # Row order is treated as the search iteration order.
    iterations = list(range(len(results)))
    scores = results[score_col].to_numpy()
    best_so_far = results[score_col].cummax().to_numpy()

    # Two hues from the same project colormap: raw score vs. running best.
    cmap = get_cmap("score_evolution")
    metric_label = metric.replace("_", " ").title()

    raw_trace = go.Scatter(
        x=iterations,
        y=scores,
        mode="lines",
        name="Score",
        line={"color": mcolors.rgb2hex(cmap(0.3)), "width": 1},
        opacity=0.6,
        hovertemplate="Iteration: %{x}<br>Score: %{y:.4f}<extra></extra>",
    )
    best_trace = go.Scatter(
        x=iterations,
        y=best_so_far,
        mode="lines",
        name="Best So Far",
        line={"color": mcolors.rgb2hex(cmap(0.7)), "width": 2},
        hovertemplate="Iteration: %{x}<br>Best So Far: %{y:.4f}<extra></extra>",
    )

    fig = go.Figure(data=[raw_trace, best_trace])
    fig.update_layout(
        title=f"{metric_label} Evolution",
        xaxis_title="Iteration",
        yaxis_title=metric_label,
        height=300,
        hovermode="x unified",
    )
    return fig

View file

@ -0,0 +1,97 @@
"""Metrics visualization plots."""
import numpy as np
import plotly.graph_objects as go
import xarray as xr
def plot_confusion_matrix(cm_data: xr.DataArray, title: str = "Confusion Matrix", normalize: str = "none") -> go.Figure:
    """Plot an interactive confusion matrix heatmap.

    Args:
        cm_data: XArray DataArray with confusion matrix data (dimensions: true_label, predicted_label).
            Assumed square with identical true/predicted label sets (labels are
            taken from the ``true_label`` coordinate for both axes).
        title: Title for the plot.
        normalize: Normalization mode - "none", "true", or "pred".

    Returns:
        Plotly figure with the interactive confusion matrix heatmap.
    """
    cm_array = cm_data.values.astype(float)
    labels = cm_data.coords["true_label"].values.tolist()
    # Keep the raw counts for hover text and cell annotations.
    cm_counts = cm_data.values

    # Apply normalization. `out=` pre-fills zeros for cells whose divisor is 0;
    # with `where=` alone those cells would be left as uninitialized memory.
    if normalize == "true":
        # Normalize over true labels (rows) - each row sums to 1
        row_sums = cm_array.sum(axis=1, keepdims=True)
        cm_normalized = np.divide(cm_array, row_sums, out=np.zeros_like(cm_array), where=row_sums != 0)
        colorbar_title = "Proportion"
    elif normalize == "pred":
        # Normalize over predicted labels (columns) - each column sums to 1
        col_sums = cm_array.sum(axis=0, keepdims=True)
        cm_normalized = np.divide(cm_array, col_sums, out=np.zeros_like(cm_array), where=col_sums != 0)
        colorbar_title = "Proportion"
    else:
        # No normalization
        cm_normalized = cm_array
        colorbar_title = "Count"

    # Loop-invariant values for the annotation pass.
    total = cm_counts.sum()
    threshold = cm_normalized.max() / 2 if cm_normalized.max() > 0 else 0.5

    # One text annotation per cell.
    annotations = []
    for i, true_label in enumerate(labels):
        for j, pred_label in enumerate(labels):
            count = int(cm_counts[i, j])
            normalized_val = cm_normalized[i, j]
            if normalize == "none":
                # Show count and percentage of total
                pct = (count / total * 100) if total > 0 else 0
                text = f"{count}<br>({pct:.1f}%)"
            else:
                # Show percentage only for normalized versions
                text = f"{normalized_val:.1%}"
            # White text on dark (high-valued) cells for readability.
            text_color = "white" if normalized_val > threshold else "black"
            annotations.append(
                {
                    "x": pred_label,
                    "y": true_label,
                    "text": text,
                    "showarrow": False,
                    "font": {"size": 10, "color": text_color},
                }
            )

    # Color by the (possibly normalized) values; hover always shows raw counts.
    fig = go.Figure(
        data=go.Heatmap(
            z=cm_normalized,
            x=labels,
            y=labels,
            colorscale="Blues",
            colorbar={"title": colorbar_title},
            hoverongaps=False,
            hovertemplate="True: %{y}<br>Predicted: %{x}<br>Count: %{customdata}<extra></extra>",
            customdata=cm_counts,
        )
    )
    fig.update_layout(
        # The `title` argument was previously accepted but never applied.
        title=title,
        annotations=annotations,
        xaxis={"title": "Predicted Label", "side": "bottom"},
        yaxis={"title": "True Label", "autorange": "reversed"},
        width=600,
        height=550,
    )
    return fig

View file

@ -0,0 +1,180 @@
"""Regression analysis plotting functions."""
from typing import cast
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from entropice.dashboard.utils.colors import get_palette
def plot_regression_scatter(
    y_true: np.ndarray | pd.Series,
    y_pred: np.ndarray | pd.Series,
    title: str = "True vs Predicted",
) -> go.Figure:
    """Create scatter plot of true vs predicted values for regression.

    Args:
        y_true: True target values.
        y_pred: Predicted target values.
        title: Title for the plot.

    Returns:
        Plotly figure with regression scatter plot.
    """
    # Local import keeps scipy an optional dependency for the rest of the module;
    # hoisted to the top of the function instead of mid-body.
    from scipy.stats import gaussian_kde

    # Convert to numpy arrays if needed
    y_true_np = cast(np.ndarray, y_true.to_numpy()) if isinstance(y_true, pd.Series) else y_true
    y_pred_np = cast(np.ndarray, y_pred.to_numpy()) if isinstance(y_pred, pd.Series) else y_pred

    # Metrics for the annotation box.
    mse = np.mean((y_true_np - y_pred_np) ** 2)
    mae = np.mean(np.abs(y_true_np - y_pred_np))
    ss_res = np.sum((y_true_np - y_pred_np) ** 2)
    ss_tot = np.sum((y_true_np - np.mean(y_true_np)) ** 2)
    # Guard zero variance in y_true: R² is undefined there (avoids a
    # divide-by-zero RuntimeWarning and an inf in the annotation).
    r2 = 1 - ss_res / ss_tot if ss_tot > 0 else float("nan")

    # Get colormap
    hex_colors = get_palette("r2", n_colors=256)

    try:
        # Color points by local density so over-plotted regions stand out.
        xy = np.vstack([y_true_np, y_pred_np])
        kde = gaussian_kde(xy)
        density = kde(xy)
    except (np.linalg.LinAlgError, ValueError):
        # Fallback if KDE fails (e.g., all points identical)
        density = np.ones(len(y_true_np))

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=y_true_np,
            y=y_pred_np,
            mode="markers",
            marker={
                "size": 6,
                "color": density,
                "colorscale": [[i / 255, c] for i, c in enumerate(hex_colors)],
                "showscale": False,
                "opacity": 0.6,
            },
            text=[f"True: {true:.3f}<br>Pred: {pred:.3f}" for true, pred in zip(y_true_np, y_pred_np)],
            hovertemplate="%{text}<extra></extra>",
            name="Data",
        )
    )

    # Add diagonal line (perfect prediction)
    min_val = min(y_true_np.min(), y_pred_np.min())
    max_val = max(y_true_np.max(), y_pred_np.max())
    fig.add_trace(
        go.Scatter(
            x=[min_val, max_val],
            y=[min_val, max_val],
            mode="lines",
            line={"color": "red", "dash": "dash", "width": 2},
            name="Perfect Prediction",
            hovertemplate="y = x<extra></extra>",
        )
    )

    # Add metrics as annotation
    metrics_text = f"R² = {r2:.4f}<br>MSE = {mse:.4f}<br>MAE = {mae:.4f}"
    fig.add_annotation(
        x=0.02,
        y=0.98,
        xref="paper",
        yref="paper",
        text=metrics_text,
        showarrow=False,
        bgcolor="white",
        bordercolor="black",
        borderwidth=1,
        xanchor="left",
        yanchor="top",
        font={"size": 12},
    )

    fig.update_layout(
        title=title,
        xaxis_title="True Values",
        yaxis_title="Predicted Values",
        height=500,
        showlegend=True,
        legend={"x": 0.98, "y": 0.02, "xanchor": "right", "yanchor": "bottom"},
    )
    # Make axes equal
    fig.update_xaxes(scaleanchor="y", scaleratio=1)
    return fig
def plot_residuals(
    y_true: np.ndarray | pd.Series,
    y_pred: np.ndarray | pd.Series,
    title: str = "Residual Plot",
) -> go.Figure:
    """Create residual plot for regression diagnostics.

    Args:
        y_true: True target values.
        y_pred: Predicted target values.
        title: Title for the plot.

    Returns:
        Plotly figure with residual plot.
    """
    # Convert to numpy arrays if needed
    y_true_np = cast(np.ndarray, y_true.to_numpy()) if isinstance(y_true, pd.Series) else y_true
    y_pred_np = cast(np.ndarray, y_pred.to_numpy()) if isinstance(y_pred, pd.Series) else y_pred

    # Residual = true - predicted.
    residuals = y_true_np - y_pred_np

    # Get colormap
    hex_colors = get_palette("r2", n_colors=256)

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            # Use the converted numpy view consistently (previously the raw
            # `y_pred` Series/array was mixed with the converted residuals).
            x=y_pred_np,
            y=residuals,
            mode="markers",
            marker={
                "size": 6,
                "color": np.abs(residuals),
                "colorscale": [[i / 255, c] for i, c in enumerate(hex_colors)],
                "showscale": True,
                "colorbar": {"title": "Abs Residual"},
                "opacity": 0.6,
            },
            text=[f"Pred: {pred:.3f}<br>Residual: {res:.3f}" for pred, res in zip(y_pred_np, residuals)],
            hovertemplate="%{text}<extra></extra>",
        )
    )

    # Reference line at zero residual.
    fig.add_hline(y=0, line_dash="dash", line_color="red", line_width=2)

    fig.update_layout(
        title=title,
        xaxis_title="Predicted Values",
        yaxis_title="Residuals (True - Predicted)",
        height=400,
        showlegend=False,
    )
    return fig

View file

@ -0,0 +1,185 @@
"""Training Result Sections."""
import streamlit as st
from entropice.dashboard.plots.metrics import plot_confusion_matrix
from entropice.dashboard.utils.formatters import format_metric_name
from entropice.dashboard.utils.loaders import TrainingResult
from entropice.dashboard.utils.stats import CVMetricStatistics
from entropice.utils.types import GridConfig
def render_run_information(selected_result: TrainingResult, refit_metric: str):
    """Render training run configuration overview.

    Args:
        selected_result: The selected TrainingResult object.
        refit_metric: The refit metric used for model selection.
    """
    st.header("📋 Run Information")
    # Grid and level are concatenated into a single key for the config lookup.
    grid_config = GridConfig.from_grid_level(f"{selected_result.settings.grid}{selected_result.settings.level}")  # ty:ignore[invalid-argument-type]
    # One metric per column: Task | Target | Grid | Model | Trials.
    col1, col2, col3, col4, col5 = st.columns(5)
    with col1:
        st.metric("Task", selected_result.settings.task.capitalize())
    with col2:
        st.metric("Target", selected_result.settings.target.capitalize())
    with col3:
        st.metric("Grid", grid_config.display_name)
    with col4:
        st.metric("Model", selected_result.settings.model.upper())
    with col5:
        # Each row in `results` is one hyperparameter-search trial.
        st.metric("Trials", len(selected_result.results))
    st.caption(f"**Refit Metric:** {format_metric_name(refit_metric)}")
def _render_metrics(metrics: dict[str, float]):
    """Render metrics in a row of up to five Streamlit columns.

    Metrics beyond the fifth wrap into the same columns round-robin.

    Args:
        metrics: Dictionary of metric names and their values.
    """
    if not metrics:
        # st.columns(0) raises, so handle an empty metric set explicitly.
        st.caption("No metrics available.")
        return
    ncols = min(5, len(metrics))
    cols = st.columns(ncols)
    for idx, (metric_name, metric_value) in enumerate(metrics.items()):
        with cols[idx % ncols]:
            st.metric(format_metric_name(metric_name), f"{metric_value:.4f}")
def render_metrics_section(selected_result: TrainingResult):
    """Render test, training, and overall metrics for the best model.

    Args:
        selected_result: The selected TrainingResult object.
    """
    # Header / caption / metrics triple per data split, rendered identically.
    sections = (
        (
            "🎯 Test Set Performance",
            "Performance metrics on the held-out test set (best model from hyperparameter search)",
            selected_result.test_metrics,
        ),
        (
            "🏋️‍♂️ Training Set Performance",
            "Performance metrics on the training set (best model from hyperparameter search)",
            selected_result.train_metrics,
        ),
        (
            "🧮 Overall Performance",
            "Overall performance metrics combining training and test sets",
            selected_result.combined_metrics,
        ),
    )
    for header, caption, metrics in sections:
        st.header(header)
        st.caption(caption)
        _render_metrics(metrics)
@st.fragment
def render_confusion_matrices(selected_result: TrainingResult):
    """Render test/train/combined confusion matrices for classification tasks.

    Args:
        selected_result: The selected TrainingResult object.
    """
    st.header("🎭 Confusion Matrices")

    # Confusion matrices only exist for classification tasks.
    if selected_result.settings.task not in ["binary", "count_regimes", "density_regimes"]:
        st.info(
            "📊 Confusion matrices are only available for classification tasks "
            "(binary, count_regimes, density_regimes)."
        )
        st.caption("Coming soon for regression tasks: residual plots and error distributions.")
        return

    if selected_result.confusion_matrix is None:
        st.warning("⚠️ No confusion matrix data found for this training result.")
        return
    cm = selected_result.confusion_matrix

    # Let the user pick how the matrices are normalized.
    st.subheader("Display Options")
    normalize_option = st.radio(
        "Normalization",
        options=["No normalization", "Normalize over True Labels", "Normalize over Predicted Labels"],
        horizontal=True,
        help="Choose how to normalize the confusion matrix values",
    )
    normalize_mode = {
        "No normalization": "none",
        "Normalize over True Labels": "true",
        "Normalize over Predicted Labels": "pred",
    }[normalize_option]

    # One column per data split, all rendered the same way.
    panels = (
        ("test", "Test Set", "Held-out test set"),
        ("train", "Training Set", "Training set"),
        ("combined", "Combined", "Train + Test sets"),
    )
    for column, (split, subheader, caption) in zip(st.columns(3), panels):
        with column:
            st.subheader(subheader)
            st.caption(caption)
            fig = plot_confusion_matrix(cm[split], title=subheader, normalize=normalize_mode)
            st.plotly_chart(fig, width="stretch")
def render_cv_statistics_section(cv_stats: CVMetricStatistics, test_score: float):
    """Render cross-validation statistics for selected metric.

    Args:
        cv_stats: CVMetricStatistics object containing cross-validation statistics.
        test_score: The test set score for the selected metric.
    """
    st.header("📈 Cross-Validation Statistics")
    st.caption("Performance during hyperparameter search (averaged across CV folds)")

    # Summary row: one metric per column.
    summary = (
        ("Best Score", cv_stats.best_score),
        ("Mean Score", cv_stats.mean_score),
        ("Std Dev", cv_stats.std_score),
        ("Worst Score", cv_stats.worst_score),
        ("Median Score", cv_stats.median_score),
    )
    for column, (label, value) in zip(st.columns(5), summary):
        with column:
            st.metric(label, f"{value:.4f}")

    if cv_stats.mean_cv_std is not None:
        st.info(f"**Mean CV Std:** {cv_stats.mean_cv_std:.4f} - Average standard deviation across CV folds")

    # Compare best CV score against the held-out test score.
    st.subheader("CV vs Test Performance")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Best CV Score", f"{cv_stats.best_score:.4f}")
    with col2:
        st.metric("Test Score", f"{test_score:.4f}")
    with col3:
        delta = test_score - cv_stats.best_score
        delta_pct = (delta / cv_stats.best_score * 100) if cv_stats.best_score != 0 else 0
        st.metric("Difference", f"{delta:+.4f}", delta=f"{delta_pct:+.2f}%")

    # Flag a CV/test gap larger than one CV standard deviation.
    if abs(delta) > cv_stats.std_score:
        st.warning(
            "⚠️ Test performance differs significantly (larger than the CV standard deviation) from CV performance. "
            "This may indicate overfitting or data distribution mismatch between training and test sets."
        )

View file

@ -2,15 +2,16 @@
from datetime import datetime from datetime import datetime
import pandas as pd
import streamlit as st import streamlit as st
from entropice.dashboard.utils.loaders import TrainingResult from entropice.dashboard.utils.loaders import AutogluonTrainingResult, TrainingResult
from entropice.utils.types import ( from entropice.utils.types import (
GridConfig, GridConfig,
) )
def render_training_results_summary(training_results: list[TrainingResult]): def render_training_results_summary(training_results: list[TrainingResult | AutogluonTrainingResult]):
"""Render summary metrics for training results.""" """Render summary metrics for training results."""
st.header("📊 Training Results Summary") st.header("📊 Training Results Summary")
col1, col2, col3, col4 = st.columns(4) col1, col2, col3, col4 = st.columns(4)
@ -23,7 +24,7 @@ def render_training_results_summary(training_results: list[TrainingResult]):
st.metric("Total Runs", len(training_results)) st.metric("Total Runs", len(training_results))
with col3: with col3:
models = {tr.settings.model for tr in training_results} models = {tr.settings.model for tr in training_results if hasattr(tr.settings, "model")}
st.metric("Model Types", len(models)) st.metric("Model Types", len(models))
with col4: with col4:
@ -33,14 +34,14 @@ def render_training_results_summary(training_results: list[TrainingResult]):
@st.fragment @st.fragment
def render_experiment_results(training_results: list[TrainingResult]): # noqa: C901 def render_experiment_results(training_results: list[TrainingResult | AutogluonTrainingResult]): # noqa: C901
"""Render detailed experiment results table and expandable details.""" """Render detailed experiment results table and expandable details."""
st.header("🎯 Experiment Results") st.header("🎯 Experiment Results")
# Filters # Filters
experiments = sorted({tr.experiment for tr in training_results if tr.experiment}) experiments = sorted({tr.experiment for tr in training_results if tr.experiment})
tasks = sorted({tr.settings.task for tr in training_results}) tasks = sorted({tr.settings.task for tr in training_results})
models = sorted({tr.settings.model for tr in training_results}) models = sorted({tr.settings.model if isinstance(tr, TrainingResult) else "autogluon" for tr in training_results})
grids = sorted({f"{tr.settings.grid}-{tr.settings.level}" for tr in training_results}) grids = sorted({f"{tr.settings.grid}-{tr.settings.level}" for tr in training_results})
# Create filter columns # Create filter columns
@ -87,14 +88,26 @@ def render_experiment_results(training_results: list[TrainingResult]): # noqa:
filtered_results = [tr for tr in filtered_results if tr.experiment == selected_experiment] filtered_results = [tr for tr in filtered_results if tr.experiment == selected_experiment]
if selected_task != "All": if selected_task != "All":
filtered_results = [tr for tr in filtered_results if tr.settings.task == selected_task] filtered_results = [tr for tr in filtered_results if tr.settings.task == selected_task]
if selected_model != "All": if selected_model != "All" and selected_model != "autogluon":
filtered_results = [tr for tr in filtered_results if tr.settings.model == selected_model] filtered_results = [
tr for tr in filtered_results if isinstance(tr, TrainingResult) and tr.settings.model == selected_model
]
elif selected_model == "autogluon":
filtered_results = [tr for tr in filtered_results if isinstance(tr, AutogluonTrainingResult)]
if selected_grid != "All": if selected_grid != "All":
filtered_results = [tr for tr in filtered_results if f"{tr.settings.grid}-{tr.settings.level}" == selected_grid] filtered_results = [tr for tr in filtered_results if f"{tr.settings.grid}-{tr.settings.level}" == selected_grid]
st.subheader("Results Table") st.subheader("Results Table")
summary_df = TrainingResult.to_dataframe(filtered_results) summary_df = TrainingResult.to_dataframe([tr for tr in filtered_results if isinstance(tr, TrainingResult)])
autogluon_df = AutogluonTrainingResult.to_dataframe(
[tr for tr in filtered_results if isinstance(tr, AutogluonTrainingResult)]
)
if len(summary_df) == 0:
summary_df = autogluon_df
elif len(autogluon_df) > 0:
summary_df = pd.concat([summary_df, autogluon_df], ignore_index=True)
# Display with color coding for best scores # Display with color coding for best scores
st.dataframe( st.dataframe(
summary_df, summary_df,
@ -107,6 +120,8 @@ def render_experiment_results(training_results: list[TrainingResult]): # noqa:
for tr in filtered_results: for tr in filtered_results:
tr_info = tr.display_info tr_info = tr.display_info
display_name = tr_info.get_display_name("model_first") display_name = tr_info.get_display_name("model_first")
model = "autogluon" if isinstance(tr, AutogluonTrainingResult) else tr.settings.model
cv_splits = tr.settings.cv_splits if hasattr(tr.settings, "cv_splits") else "N/A"
with st.expander(display_name): with st.expander(display_name):
col1, col2 = st.columns([1, 2]) col1, col2 = st.columns([1, 2])
@ -117,12 +132,12 @@ def render_experiment_results(training_results: list[TrainingResult]): # noqa:
f"- **Experiment:** {tr.experiment}\n" f"- **Experiment:** {tr.experiment}\n"
f"- **Task:** {tr.settings.task}\n" f"- **Task:** {tr.settings.task}\n"
f"- **Target:** {tr.settings.target}\n" f"- **Target:** {tr.settings.target}\n"
f"- **Model:** {tr.settings.model}\n" f"- **Model:** {model}\n"
f"- **Grid:** {grid_config.display_name}\n" f"- **Grid:** {grid_config.display_name}\n"
f"- **Created At:** {tr_info.timestamp.strftime('%Y-%m-%d %H:%M')}\n" f"- **Created At:** {tr_info.timestamp.strftime('%Y-%m-%d %H:%M')}\n"
f"- **Temporal Mode:** {tr.settings.temporal_mode}\n" f"- **Temporal Mode:** {tr.settings.temporal_mode}\n"
f"- **Members:** {', '.join(tr.settings.members)}\n" f"- **Members:** {', '.join(tr.settings.members)}\n"
f"- **CV Splits:** {tr.settings.cv_splits}\n" f"- **CV Splits:** {cv_splits}\n"
f"- **Classes:** {tr.settings.classes}\n" f"- **Classes:** {tr.settings.classes}\n"
) )
@ -140,26 +155,29 @@ def render_experiment_results(training_results: list[TrainingResult]): # noqa:
file_str += f"- 📄 `{file.name}`\n" file_str += f"- 📄 `{file.name}`\n"
st.write(file_str) st.write(file_str)
with col2: with col2:
st.write("**CV Score Summary:**") if isinstance(tr, AutogluonTrainingResult):
st.write("**Leaderboard:**")
# Extract all test scores st.dataframe(tr.leaderboard, width="stretch", hide_index=True)
metric_df = tr.get_metric_dataframe()
if metric_df is not None:
st.dataframe(metric_df, width="stretch", hide_index=True)
else: else:
st.write("No test scores found in results.") st.write("**CV Score Summary:**")
# Extract all test scores
metric_df = tr.get_metric_dataframe()
if metric_df is not None:
st.dataframe(metric_df, width="stretch", hide_index=True)
else:
st.write("No test scores found in results.")
# Show parameter space explored # Show parameter space explored
if "initial_K" in tr.results.columns: # Common parameter if "initial_K" in tr.results.columns: # Common parameter
st.write("\n**Parameter Ranges Explored:**") st.write("\n**Parameter Ranges Explored:**")
for param in ["initial_K", "eps_cl", "eps_e"]: for param in ["initial_K", "eps_cl", "eps_e"]:
if param in tr.results.columns: if param in tr.results.columns:
min_val = tr.results[param].min() min_val = tr.results[param].min()
max_val = tr.results[param].max() max_val = tr.results[param].max()
unique_vals = tr.results[param].nunique() unique_vals = tr.results[param].nunique()
st.write(f"- **{param}:** {unique_vals} values ({min_val:.2e} to {max_val:.2e})") st.write(f"- **{param}:** {unique_vals} values ({min_val:.2e} to {max_val:.2e})")
st.write("**CV Results DataFrame:**") st.write("**CV Results DataFrame:**")
st.dataframe(tr.results, width="stretch", hide_index=True) st.dataframe(tr.results, width="stretch", hide_index=True)
st.write(f"\n**Path:** `{tr.path}`") st.write(f"\n**Path:** `{tr.path}`")

View file

@ -0,0 +1,172 @@
"""Hyperparameter Space Visualization Section."""
import streamlit as st
from entropice.dashboard.plots.hyperparameter_space import (
plot_parameter_correlations,
plot_parameter_distributions,
plot_parameter_interactions,
plot_score_evolution,
plot_score_vs_parameters,
)
from entropice.dashboard.utils.formatters import format_metric_name
from entropice.dashboard.utils.loaders import TrainingResult
def _render_performance_summary(results, refit_metric: str):
    """Render performance summary subsection.

    Shows the hyperparameters of the row with the highest
    ``mean_test_<refit_metric>`` score as a row of ``st.metric`` widgets.

    Args:
        results: CV results dataframe with sklearn ``cv_results_``-style
            columns (``mean_test_*``, ``param_*``).
        refit_metric: Metric name used to select the best row.
    """
    best_idx = results[f"mean_test_{refit_metric}"].idxmax()
    best_row = results.loc[best_idx]

    # Extract parameter columns. Use removeprefix (not str.replace) so a
    # parameter whose own name contains "param_" is not mangled.
    param_cols = [col for col in results.columns if col.startswith("param_") and col != "params"]
    best_params = {col.removeprefix("param_"): best_row[col] for col in param_cols}

    # Display best parameter combination
    if not best_params:
        return
    with st.container(border=True):
        st.subheader("🏆 Best Parameter Combination")
        st.caption(f"Parameters of the best model (selected by {format_metric_name(refit_metric)} score)")
        cols = st.columns(len(best_params))
        for idx, (param_name, param_value) in enumerate(best_params.items()):
            with cols[idx]:
                st.metric(param_name, _format_param_value(param_value))


def _format_param_value(value) -> str:
    """Format a hyperparameter value for display.

    Ints are shown without decimals, very small non-zero floats in
    scientific notation, other floats with four decimals, and everything
    else via ``str()``.
    """
    if isinstance(value, int):
        return f"{value:.0f}"
    if isinstance(value, float):
        # Use scientific notation for very small numbers
        if abs(value) < 0.001 and value != 0:
            return f"{value:.2e}"
        return f"{value:.4f}"
    return str(value)
def _render_parameter_distributions(results, param_grid: dict | None):
    """Render parameter distributions subsection.

    Shows one distribution chart per explored hyperparameter, laid out in
    rows of at most three columns.
    """
    st.subheader("Parameter Distributions")
    st.caption("Distribution of hyperparameter values explored during random search")
    param_charts = plot_parameter_distributions(results, param_grid)
    if not param_charts:
        st.info("No parameter distribution data available.")
        return

    names = list(param_charts)
    per_row = min(3, len(names))
    # Lay the charts out row by row; a short final row leaves empty columns.
    for start in range(0, len(names), per_row):
        for slot, name in zip(st.columns(per_row), names[start : start + per_row]):
            with slot:
                st.plotly_chart(param_charts[name], width="stretch")
def _render_score_evolution(results, selected_metric: str):
    """Render score evolution subsection.

    Plots how the selected metric changed over the search iterations, or a
    warning when no evolution chart can be built for that metric.
    """
    st.subheader("Score Evolution Over Iterations")
    st.caption(f"How {format_metric_name(selected_metric)} evolved during the random search")
    chart = plot_score_evolution(results, selected_metric)
    if not chart:
        st.warning(f"Score evolution not available for metric: {selected_metric}")
        return
    st.plotly_chart(chart, width="stretch")
def _render_score_vs_parameters(results, selected_metric: str, param_grid: dict | None):
    """Render score vs parameters subsection.

    One scatter per hyperparameter relating it to the selected metric,
    arranged in rows of at most two columns.
    """
    st.subheader("Score vs Individual Parameters")
    st.caption(f"Relationship between {format_metric_name(selected_metric)} and each hyperparameter")
    charts = plot_score_vs_parameters(results, selected_metric, param_grid)
    if not charts:
        st.info("No score vs parameter data available.")
        return

    names = list(charts)
    per_row = min(2, len(names))
    # Chunked layout: each chunk becomes one row of columns.
    for start in range(0, len(names), per_row):
        for slot, name in zip(st.columns(per_row), names[start : start + per_row]):
            with slot:
                st.plotly_chart(charts[name], width="stretch")
def _render_parameter_correlations(results, selected_metric: str):
    """Render parameter correlations subsection.

    Shows correlations between numeric hyperparameters and the selected
    metric, or an info message when no numeric parameters exist.
    """
    st.subheader("Parameter-Score Correlations")
    st.caption(f"Correlation between numeric parameters and {format_metric_name(selected_metric)}")
    chart = plot_parameter_correlations(results, selected_metric)
    if not chart:
        st.info("No numeric parameters found for correlation analysis.")
        return
    st.plotly_chart(chart, width="stretch")
def _render_parameter_interactions(results, selected_metric: str, param_grid: dict | None):
    """Render parameter interactions subsection.

    Shows pairwise parameter-interaction charts in rows of at most two
    columns, or an info message when too few numeric parameters exist.
    """
    st.subheader("Parameter Interactions")
    st.caption(f"Interaction between parameter pairs and their effect on {format_metric_name(selected_metric)}")
    charts = plot_parameter_interactions(results, selected_metric, param_grid)
    if not charts:
        st.info("Not enough numeric parameters for parameter interaction visualization.")
        return

    per_row = min(2, len(charts))
    # Walk the chart list chunk by chunk, one chunk per row of columns.
    for start in range(0, len(charts), per_row):
        for slot, chart in zip(st.columns(per_row), charts[start : start + per_row]):
            with slot:
                st.plotly_chart(chart, width="stretch")
def render_hparam_space_section(selected_result: TrainingResult, selected_metric: str):
    """Render the hyperparameter space visualization section.

    Args:
        selected_result: The selected TrainingResult object.
        selected_metric: The metric to focus analysis on.
    """
    st.header("🧩 Hyperparameter Space Exploration")

    cv_results = selected_result.results
    # NOTE(review): reaches into a private TrainingResult method — consider
    # exposing a public accessor for the refit metric name.
    refit_metric = selected_result._get_best_metric_name()
    grid = selected_result.settings.param_grid

    # Sub-sections, top to bottom: best combination, distributions,
    # evolution, per-parameter scores, correlations, interactions.
    _render_performance_summary(cv_results, refit_metric)
    _render_parameter_distributions(cv_results, grid)
    _render_score_evolution(cv_results, selected_metric)
    _render_score_vs_parameters(cv_results, selected_metric, grid)
    _render_parameter_correlations(cv_results, selected_metric)
    _render_parameter_interactions(cv_results, selected_metric, grid)

View file

@ -0,0 +1,122 @@
"""Regression Analysis Section."""
import streamlit as st
from entropice.dashboard.plots.regression import plot_regression_scatter, plot_residuals
from entropice.dashboard.utils.loaders import TrainingResult
from entropice.ml.dataset import DatasetEnsemble
def render_regression_analysis(selected_result: TrainingResult):
    """Render regression analysis with true vs predicted scatter plots.

    Rebuilds the training set from the stored settings to recover true
    target values and the train/test split, merges them with the saved
    predictions, and shows scatter and residual plots for the test,
    train, and combined subsets.

    Args:
        selected_result: The selected TrainingResult object.
    """
    st.header("📊 Regression Analysis")

    # Classification tasks have no continuous predictions to analyse.
    if selected_result.settings.task in ["binary", "count_regimes", "density_regimes"]:
        st.info("📈 Regression analysis is only available for regression tasks (count, density).")
        return

    # Load predictions
    predictions_df = selected_result.load_predictions()
    if predictions_df is None:
        st.warning("⚠️ No prediction data found for this training result.")
        return

    # Recreate the exact DatasetEnsemble/TrainingSet the run was trained on
    # so the true values and split assignment match the predictions.
    with st.spinner("Loading training data to get true values..."):
        ensemble = DatasetEnsemble(
            grid=selected_result.settings.grid,
            level=selected_result.settings.level,
            members=selected_result.settings.members,
            temporal_mode=selected_result.settings.temporal_mode,
            dimension_filters=selected_result.settings.dimension_filters,
            variable_filters=selected_result.settings.variable_filters,
            add_lonlat=selected_result.settings.add_lonlat,
        )
        training_set = ensemble.create_training_set(
            task=selected_result.settings.task,
            target=selected_result.settings.target,
            device="cpu",
            cache_mode="read",
        )

    # Get split information
    split_series = training_set.split

    # predictions_df has 'cell_id' and 'predicted' columns;
    # training_set.targets holds true values in 'y' indexed by cell_id.
    true_values = training_set.targets[["y"]].reset_index()
    merged = predictions_df.merge(true_values, on="cell_id", how="inner")
    merged["split"] = split_series.reindex(merged["cell_id"]).values

    train_data = merged[merged["split"] == "train"]
    test_data = merged[merged["split"] == "test"]
    if len(train_data) == 0 or len(test_data) == 0:
        st.error("❌ Could not properly split data into train and test sets.")
        return

    # One entry per column: (title, caption, data subset).
    subsets = [
        ("Test Set", "Held-out test set", test_data),
        ("Training Set", "Training set", train_data),
        ("Combined", "Train + Test sets", merged),
    ]

    # Display scatter plots
    st.subheader("True vs Predicted Values")
    st.caption("Scatter plots showing the relationship between true and predicted values")
    for col, (title, caption, data) in zip(st.columns(3), subsets):
        with col:
            st.markdown(f"#### {title}")
            st.caption(caption)
            fig = plot_regression_scatter(data["y"], data["predicted"], title=title)
            # width="stretch" keeps this page consistent with the rest of the
            # dashboard (use_container_width is the deprecated spelling).
            st.plotly_chart(fig, width="stretch")

    # Display residual plots
    st.subheader("Residual Analysis")
    st.caption("Residual plots to assess model fit and identify patterns in errors")
    for col, (title, _caption, data) in zip(st.columns(3), subsets):
        with col:
            fig_res = plot_residuals(data["y"], data["predicted"], title=f"{title} Residuals")
            st.plotly_chart(fig_res, width="stretch")

View file

@ -1,70 +0,0 @@
"""Utilities for ordering predicted classes consistently across visualizations.
This module leverages the canonical class labels defined in the ML dataset module
to ensure consistent ordering across all visualizations.
"""
import pandas as pd
from entropice.utils.types import Task
# Canonical orderings imported from the ML pipeline
# Binary labels are defined inline in dataset.py: {False: "No RTS", True: "RTS"}
# Count/Density labels are defined in the bin_values function
# Canonical class labels per task, mirroring the ML dataset module.
BINARY_LABELS = ["No RTS", "RTS"]
COUNT_LABELS = ["None", "Very Few", "Few", "Several", "Many", "Very Many"]
DENSITY_LABELS = ["Empty", "Very Sparse", "Sparse", "Moderate", "Dense", "Very Dense"]

# Lookup from task name to its canonical label ordering.
CLASS_ORDERINGS: dict[Task | str, list[str]] = {
    "binary": BINARY_LABELS,
    "count": COUNT_LABELS,
    "density": DENSITY_LABELS,
}


def get_ordered_classes(task: Task | str, available_classes: list[str] | None = None) -> list[str]:
    """Get properly ordered class labels for a given task.

    This uses the same canonical ordering as defined in the ML dataset module,
    ensuring consistency between training and inference visualizations.

    Args:
        task: Task type ('binary', 'count', 'density').
        available_classes: Optional list of available classes to filter and order.
            If None, returns all canonical classes for the task.

    Returns:
        List of class labels in proper order.

    Examples:
        >>> get_ordered_classes("binary")
        ['No RTS', 'RTS']
        >>> get_ordered_classes("count", ["None", "Few", "Several"])
        ['None', 'Few', 'Several']
    """
    canonical = CLASS_ORDERINGS[task]
    if available_classes is None:
        return canonical
    # Keep only the available classes, in canonical order (membership via a
    # set is equivalent here because order comes from the canonical list).
    wanted = set(available_classes)
    return [label for label in canonical if label in wanted]
def sort_class_series(series: pd.Series, task: Task | str) -> pd.Series:
    """Sort a pandas Series with class labels according to canonical ordering.

    Args:
        series: Pandas Series with class labels as index.
        task: Task type ('binary', 'count', 'density').

    Returns:
        Sorted Series with classes in canonical order.
    """
    # Order the labels present in the index canonically, then reindex.
    ordered = get_ordered_classes(task, series.index.tolist())
    return series.reindex(ordered)

View file

@ -59,7 +59,7 @@ task_display_infos: dict[Task, TaskDisplayInfo] = {
class TrainingResultDisplayInfo: class TrainingResultDisplayInfo:
task: Task task: Task
target: TargetDataset target: TargetDataset
model: Model model: Model | Literal["autogluon"]
grid: Grid grid: Grid
level: int level: int
timestamp: datetime timestamp: datetime

View file

@ -17,6 +17,7 @@ from shapely.geometry import shape
import entropice.spatial.grids import entropice.spatial.grids
import entropice.utils.paths import entropice.utils.paths
from entropice.dashboard.utils.formatters import TrainingResultDisplayInfo from entropice.dashboard.utils.formatters import TrainingResultDisplayInfo
from entropice.ml.autogluon_training import AutoGluonTrainingSettings
from entropice.ml.dataset import DatasetEnsemble, TrainingSet from entropice.ml.dataset import DatasetEnsemble, TrainingSet
from entropice.ml.training import TrainingSettings from entropice.ml.training import TrainingSettings
from entropice.utils.types import GridConfig, TargetDataset, Task, all_target_datasets, all_tasks from entropice.utils.types import GridConfig, TargetDataset, Task, all_target_datasets, all_tasks
@ -215,14 +216,18 @@ class TrainingResult:
return pd.DataFrame.from_records(records) return pd.DataFrame.from_records(records)
@st.cache_data @st.cache_data(ttl=300) # Cache for 5 minutes
def load_all_training_results() -> list[TrainingResult]: def load_all_training_results() -> list[TrainingResult]:
"""Load all training results from the results directory.""" """Load all training results from the results directory."""
results_dir = entropice.utils.paths.RESULTS_DIR results_dir = entropice.utils.paths.RESULTS_DIR
training_results: list[TrainingResult] = [] training_results: list[TrainingResult] = []
incomplete_results: list[tuple[Path, Exception]] = []
for result_path in results_dir.iterdir(): for result_path in results_dir.iterdir():
if not result_path.is_dir(): if not result_path.is_dir():
continue continue
# Skip AutoGluon results directory
if "autogluon" in result_path.name.lower():
continue
try: try:
training_result = TrainingResult.from_path(result_path) training_result = TrainingResult.from_path(result_path)
training_results.append(training_result) training_results.append(training_result)
@ -237,10 +242,159 @@ def load_all_training_results() -> list[TrainingResult]:
training_results.append(training_result) training_results.append(training_result)
is_experiment_dir = True is_experiment_dir = True
except FileNotFoundError as e2: except FileNotFoundError as e2:
st.warning(f"Skipping incomplete training result: {e2}") incomplete_results.append((experiment_path, e2))
if not is_experiment_dir: if not is_experiment_dir:
st.warning(f"Skipping incomplete training result: {e}") incomplete_results.append((result_path, e))
if len(incomplete_results) > 0:
st.warning(
f"Found {len(incomplete_results)} incomplete training results that were skipped:\n - "
+ "\n - ".join(f"{p}: {e}" for p, e in incomplete_results)
)
# Sort by creation time (most recent first)
training_results.sort(key=lambda tr: tr.created_at, reverse=True)
return training_results
@dataclass
class AutogluonTrainingResult:
    """Wrapper for training result data and metadata.

    Loads the artifacts an AutoGluon run writes into a result directory
    (settings TOML, pickled test metrics, leaderboard/feature-importance
    parquet files) and exposes them for the dashboard.
    """

    path: Path  # result directory this object was loaded from
    experiment: str  # experiment name, or "N/A" when loaded from a flat result dir
    settings: AutoGluonTrainingSettings  # parsed from training_settings.toml
    test_metrics: dict[str, float | dict | pd.DataFrame]  # unpickled from test_metrics.pickle
    leaderboard: pd.DataFrame  # AutoGluon leaderboard read from leaderboard.parquet
    feature_importance: pd.DataFrame | None  # None when feature_importance.parquet is absent
    created_at: float  # st_ctime of the result dir (metadata-change time on Unix, not creation)
    files: list[Path]  # every path found directly inside the result directory

    @classmethod
    def from_path(cls, result_path: Path, experiment_name: str | None = None) -> "AutogluonTrainingResult":
        """Load an AutogluonTrainingResult from a given result directory path.

        Args:
            result_path: Directory containing the AutoGluon artifacts.
            experiment_name: Optional experiment name; "N/A" is stored when omitted.

        Raises:
            FileNotFoundError: If the settings, metrics, or leaderboard file is missing.
        """
        settings_file = result_path / "training_settings.toml"
        metrics_file = result_path / "test_metrics.pickle"
        leaderboard_file = result_path / "leaderboard.parquet"
        feature_importance_file = result_path / "feature_importance.parquet"
        all_files = list(result_path.iterdir())
        # Validate the three mandatory files up-front so callers get a
        # FileNotFoundError (used by the loader to detect experiment dirs).
        if not settings_file.exists():
            raise FileNotFoundError(f"Missing settings file in {result_path}")
        if not metrics_file.exists():
            raise FileNotFoundError(f"Missing metrics file in {result_path}")
        if not leaderboard_file.exists():
            raise FileNotFoundError(f"Missing leaderboard file in {result_path}")
        created_at = result_path.stat().st_ctime
        settings_dict = toml.load(settings_file)["settings"]
        settings = AutoGluonTrainingSettings(**settings_dict)
        # NOTE(review): pickle.load trusts the file contents — only load
        # results written by this project.
        with open(metrics_file, "rb") as f:
            metrics = pickle.load(f)
        leaderboard = pd.read_parquet(leaderboard_file)
        # Feature importance is optional; absent for runs that skipped it.
        if feature_importance_file.exists():
            feature_importance = pd.read_parquet(feature_importance_file)
        else:
            feature_importance = None
        return cls(
            path=result_path,
            experiment=experiment_name or "N/A",
            settings=settings,
            test_metrics=metrics,
            leaderboard=leaderboard,
            feature_importance=feature_importance,
            created_at=created_at,
            files=all_files,
        )

    @property
    def test_confusion_matrix(self) -> pd.DataFrame | None:
        """Get the test confusion matrix, or None if the metrics lack one."""
        if "confusion_matrix" not in self.test_metrics:
            return None
        assert isinstance(self.test_metrics["confusion_matrix"], pd.DataFrame)
        return self.test_metrics["confusion_matrix"]

    @property
    def display_info(self) -> TrainingResultDisplayInfo:
        """Get display information for the training result.

        The model is always reported as the literal "autogluon".
        """
        return TrainingResultDisplayInfo(
            task=self.settings.task,
            target=self.settings.target,
            model="autogluon",
            grid=self.settings.grid,
            level=self.settings.level,
            timestamp=datetime.fromtimestamp(self.created_at),
        )

    def _get_best_metric_name(self) -> str:
        """Get the primary metric name for a given task.

        f1 for binary, f1_weighted for regime classification, RMSE otherwise.
        """
        match self.settings.task:
            case "binary":
                return "f1"
            case "count_regimes" | "density_regimes":
                return "f1_weighted"
            case _:  # regression tasks
                return "root_mean_squared_error"

    @staticmethod
    def to_dataframe(training_results: list["AutogluonTrainingResult"]) -> pd.DataFrame:
        """Convert a list of AutogluonTrainingResult objects to a DataFrame for display.

        Args:
            training_results: Results to summarize, one row each.

        Returns:
            DataFrame with display columns (task, target, grid, score, path, ...).
        """
        records = []
        for tr in training_results:
            info = tr.display_info
            best_metric_name = tr._get_best_metric_name()
            record = {
                "Experiment": tr.experiment if tr.experiment else "N/A",
                "Task": info.task,
                "Target": info.target,
                "Model": info.model,
                "Grid": GridConfig.from_grid_level((info.grid, info.level)).display_name,
                "Created At": info.timestamp.strftime("%Y-%m-%d %H:%M"),
                "Score-Metric": best_metric_name.title(),
                "Best Models Score (Test-Set)": tr.test_metrics.get(best_metric_name),
                "Path": str(tr.path.name),
            }
            records.append(record)
        return pd.DataFrame.from_records(records)
@st.cache_data(ttl=300)  # Cache for 5 minutes
def load_all_autogluon_training_results() -> list[AutogluonTrainingResult]:
    """Load all training results from the results directory.

    Only directories whose name contains "autogluon" are considered. A
    directory that is not itself a result is retried as an experiment folder
    containing one result per subdirectory; everything that still fails is
    reported once via a single st.warning.
    """
    results_dir = entropice.utils.paths.RESULTS_DIR
    loaded: list[AutogluonTrainingResult] = []
    skipped: list[tuple[Path, Exception]] = []

    for candidate in results_dir.iterdir():
        # Only AutoGluon result directories belong to this loader.
        if not candidate.is_dir() or "autogluon" not in candidate.name.lower():
            continue
        try:
            loaded.append(AutogluonTrainingResult.from_path(candidate))
        except FileNotFoundError as outer_err:
            # Maybe an experiment folder: try each subdirectory as a result.
            found_experiment = False
            for sub in candidate.iterdir():
                if not sub.is_dir():
                    continue
                try:
                    loaded.append(AutogluonTrainingResult.from_path(sub, sub.parent.name))
                    found_experiment = True
                except FileNotFoundError as inner_err:
                    skipped.append((sub, inner_err))
            if not found_experiment:
                skipped.append((candidate, outer_err))

    if skipped:
        st.warning(
            f"Found {len(skipped)} incomplete autogluon training results that were skipped:\n - "
            + "\n - ".join(f"{p}: {e}" for p, e in skipped)
        )

    # Sort by creation time (most recent first)
    loaded.sort(key=lambda tr: tr.created_at, reverse=True)
    return loaded

View file

@ -369,6 +369,7 @@ def render_xgboost_model_state(model_state: xr.Dataset, selected_result: Trainin
options=["gain", "weight", "cover", "total_gain", "total_cover"], options=["gain", "weight", "cover", "total_gain", "total_cover"],
index=0, index=0,
help="Choose which importance metric to visualize", help="Choose which importance metric to visualize",
key="model_state_importance_type",
) )
# Top N slider # Top N slider

View file

@ -9,7 +9,7 @@ from entropice.dashboard.sections.experiment_results import (
render_training_results_summary, render_training_results_summary,
) )
from entropice.dashboard.sections.storage_statistics import render_storage_statistics from entropice.dashboard.sections.storage_statistics import render_storage_statistics
from entropice.dashboard.utils.loaders import load_all_training_results from entropice.dashboard.utils.loaders import load_all_autogluon_training_results, load_all_training_results
from entropice.dashboard.utils.stats import DatasetStatistics, load_all_default_dataset_statistics from entropice.dashboard.utils.stats import DatasetStatistics, load_all_default_dataset_statistics
@ -27,6 +27,9 @@ def render_overview_page():
) )
# Load training results # Load training results
training_results = load_all_training_results() training_results = load_all_training_results()
autogluon_results = load_all_autogluon_training_results()
if len(autogluon_results) > 0:
training_results.extend(autogluon_results)
if not training_results: if not training_results:
st.warning("No training results found. Please run some training experiments first.") st.warning("No training results found. Please run some training experiments first.")

View file

@ -2,150 +2,22 @@
from typing import cast from typing import cast
import geopandas as gpd
import streamlit as st import streamlit as st
import xarray as xr
from stopuhr import stopwatch
from entropice.dashboard.plots.hyperparameter_analysis import ( from entropice.dashboard.sections.cv_result import (
render_binned_parameter_space, render_confusion_matrices,
render_confusion_matrix_heatmap, render_cv_statistics_section,
render_confusion_matrix_map, render_metrics_section,
render_espa_binned_parameter_space, render_run_information,
render_multi_metric_comparison,
render_parameter_correlation,
render_parameter_distributions,
render_performance_summary,
render_top_configurations,
) )
from entropice.dashboard.sections.hparam_space import render_hparam_space_section
from entropice.dashboard.sections.regression_analysis import render_regression_analysis
from entropice.dashboard.utils.formatters import format_metric_name from entropice.dashboard.utils.formatters import format_metric_name
from entropice.dashboard.utils.loaders import TrainingResult, load_all_training_results from entropice.dashboard.utils.loaders import TrainingResult, load_all_training_results
from entropice.dashboard.utils.stats import CVResultsStatistics from entropice.dashboard.utils.stats import CVMetricStatistics
from entropice.utils.types import GridConfig
def load_predictions_with_labels(selected_result: TrainingResult) -> gpd.GeoDataFrame | None: def render_analysis_settings_sidebar(training_results: list[TrainingResult]) -> tuple[TrainingResult, str, str]:
"""Load predictions and merge with training data to get true labels and split info.
Args:
selected_result: The selected TrainingResult object.
Returns:
GeoDataFrame with predictions, true labels, and split information, or None if unavailable.
"""
from sklearn.model_selection import train_test_split
from entropice.ml.dataset import DatasetEnsemble, bin_values, taskcol
# Load predictions
preds_gdf = selected_result.load_predictions()
if preds_gdf is None:
return None
# Create a minimal dataset ensemble to access target data
settings = selected_result.settings
dataset_ensemble = DatasetEnsemble(
grid=settings.grid,
level=settings.level,
target=settings.target,
members=[], # No feature data needed, just targets
)
# Load target dataset (just labels, no features)
with st.spinner("Loading target labels..."):
targets = dataset_ensemble._read_target()
# Get coverage and task columns
task_col = taskcol[settings.task][settings.target]
# Filter for valid labels (same as in _cat_and_split)
valid_labels = targets[task_col].notna()
filtered_targets = targets.loc[valid_labels].copy()
# Apply binning to get class labels (same logic as _cat_and_split)
if settings.task == "binary":
binned = filtered_targets[task_col].map({False: "No RTS", True: "RTS"}).astype("category")
elif settings.task == "count":
binned = bin_values(filtered_targets[task_col].astype(int), task=settings.task)
elif settings.task == "density":
binned = bin_values(filtered_targets[task_col], task=settings.task)
else:
raise ValueError(f"Invalid task: {settings.task}")
filtered_targets["true_class"] = binned.to_numpy()
# Recreate the train/test split deterministically (same random_state=42 as in _cat_and_split)
_train_idx, test_idx = train_test_split(
filtered_targets.index.to_numpy(), test_size=0.2, random_state=42, shuffle=True
)
filtered_targets["split"] = "train"
filtered_targets.loc[test_idx, "split"] = "test"
filtered_targets["split"] = filtered_targets["split"].astype("category")
# Ensure cell_id is available as a column for merging
# Check if cell_id already exists, otherwise use the index
if "cell_id" not in filtered_targets.columns:
filtered_targets = filtered_targets.reset_index().rename(columns={"index": "cell_id"})
# Merge predictions with labels (inner join to keep only cells with predictions)
merged = filtered_targets.merge(preds_gdf[["cell_id", "predicted_class"]], on="cell_id", how="inner")
merged_gdf = gpd.GeoDataFrame(merged, geometry="geometry", crs=targets.crs)
return merged_gdf
def compute_confusion_matrix_from_merged_data(
    merged_data: gpd.GeoDataFrame,
    split_type: str,
    label_names: list[str],
) -> xr.DataArray | None:
    """Compute confusion matrix from merged predictions and labels.

    Args:
        merged_data: GeoDataFrame with 'true_class', 'predicted_class', and 'split' columns.
        split_type: One of 'test', 'train', or 'all'.
        label_names: List of class label names in order.

    Returns:
        xarray.DataArray with confusion matrix or None if data unavailable.
    """
    from sklearn.metrics import confusion_matrix

    # Select the rows belonging to the requested split.
    if split_type == "all":
        subset = merged_data
    elif split_type in ("train", "test"):
        subset = merged_data[merged_data["split"] == split_type]
    else:
        raise ValueError(f"Invalid split_type: {split_type}")

    if len(subset) == 0:
        st.warning(f"No data available for {split_type} split.")
        return None

    cm = confusion_matrix(
        subset["true_class"].to_numpy(),
        subset["predicted_class"].to_numpy(),
        labels=label_names,
    )
    # Wrap in a labelled DataArray so downstream plots can index by class name.
    return xr.DataArray(
        cm,
        dims=["true_label", "predicted_label"],
        coords={"true_label": label_names, "predicted_label": label_names},
        name="confusion_matrix",
    )
def render_analysis_settings_sidebar(training_results: list[TrainingResult]) -> tuple[TrainingResult, str, str, int]:
"""Render sidebar for training run and analysis settings selection. """Render sidebar for training run and analysis settings selection.
Args: Args:
@ -155,351 +27,63 @@ def render_analysis_settings_sidebar(training_results: list[TrainingResult]) ->
Tuple of (selected_result, selected_metric, refit_metric, top_n). Tuple of (selected_result, selected_metric, refit_metric, top_n).
""" """
st.header("Select Training Run") with st.sidebar.form("training_analysis_settings_form"):
st.header("Select Training Run")
# Create selection options with task-first naming # Create selection options with task-first naming
training_options = {tr.display_info.get_display_name("task_first"): tr for tr in training_results} training_options = {tr.display_info.get_display_name("task_first"): tr for tr in training_results}
selected_name = st.selectbox( selected_name = st.selectbox(
"Training Run", "Training Run",
options=list(training_options.keys()), options=list(training_options.keys()),
index=0, index=0,
help="Select a training run to analyze", help="Select a training run to analyze",
key="training_run_select", key="training_run_select",
) )
selected_result = cast(TrainingResult, training_options[selected_name]) selected_result = cast(TrainingResult, training_options[selected_name])
st.divider()
# Metric selection for detailed analysis
st.subheader("Analysis Settings")
available_metrics = selected_result.available_metrics
# Try to get refit metric from settings
refit_metric = "f1" if selected_result.settings.task == "binary" else "f1_weighted"
if refit_metric in available_metrics:
default_metric_idx = available_metrics.index(refit_metric)
else:
default_metric_idx = 0
selected_metric = st.selectbox(
"Primary Metric for Analysis",
options=available_metrics,
index=default_metric_idx,
format_func=format_metric_name,
help="Select the metric to focus on for detailed analysis",
key="metric_select",
)
# Top N configurations
top_n = st.slider(
"Top N Configurations",
min_value=5,
max_value=50,
value=10,
step=5,
help="Number of top configurations to display",
key="top_n_slider",
)
return selected_result, selected_metric, refit_metric, top_n
def render_run_information(selected_result: TrainingResult, refit_metric):
    """Render training run configuration overview.

    Args:
        selected_result: The selected TrainingResult object.
        refit_metric: The refit metric used for model selection.
    """
    st.header("📋 Run Information")
    settings = selected_result.settings
    grid_config = GridConfig.from_grid_level(f"{settings.grid}{settings.level}")  # ty:ignore[invalid-argument-type]

    # Five headline facts about the run, one st.metric per column.
    facts = (
        ("Task", settings.task.capitalize()),
        ("Target", settings.target.capitalize()),
        ("Grid", grid_config.display_name),
        ("Model", settings.model.upper()),
        ("Trials", len(selected_result.results)),
    )
    for column, (label, value) in zip(st.columns(5), facts):
        with column:
            st.metric(label, value)

    st.caption(f"**Refit Metric:** {format_metric_name(refit_metric)}")
def render_test_metrics_section(selected_result: TrainingResult):
    """Render test metrics overview showing final model performance.

    The metric labels/keys shown depend on the task: binary classification
    gets a single 5-column row, every other task gets three 3-column rows
    of macro/weighted aggregates.

    Args:
        selected_result: The selected TrainingResult object.
    """
    st.header("🎯 Test Set Performance")
    st.caption("Performance metrics on the held-out test set (best model from hyperparameter search)")
    test_metrics = selected_result.metrics
    if not test_metrics:
        st.warning("No test metrics available for this training run.")
        return
    # Each inner list is one row of (display label, metrics dict key) pairs.
    if selected_result.settings.task == "binary":
        rows = [
            [
                ("Accuracy", "accuracy"),
                ("F1 Score", "f1"),
                ("Precision", "precision"),
                ("Recall", "recall"),
                ("Jaccard", "jaccard"),
            ],
        ]
    else:
        rows = [
            [
                ("Accuracy", "accuracy"),
                ("F1 (Macro)", "f1_macro"),
                ("F1 (Weighted)", "f1_weighted"),
            ],
            [
                ("Precision (Macro)", "precision_macro"),
                ("Precision (Weighted)", "precision_weighted"),
                ("Recall (Macro)", "recall_macro"),
            ],
            [
                ("Jaccard (Micro)", "jaccard_micro"),
                ("Jaccard (Macro)", "jaccard_macro"),
                ("Jaccard (Weighted)", "jaccard_weighted"),
            ],
        ]
    for row in rows:
        _render_metric_row(test_metrics, row)


def _render_metric_row(metrics, labeled_keys):
    """Render one row of st.metric widgets, one column per entry.

    Args:
        metrics: Mapping of metric key -> float score.
        labeled_keys: Sequence of (display label, metric key) pairs.
            Missing keys display as 0, matching the original fallback.
    """
    for column, (label, key) in zip(st.columns(len(labeled_keys)), labeled_keys):
        with column:
            st.metric(label, f"{metrics.get(key, 0):.4f}")
def render_cv_statistics_section(selected_result, selected_metric):
"""Render cross-validation statistics for selected metric.
Args:
selected_result: The selected TrainingResult object.
selected_metric: The metric to display statistics for.
"""
st.header("📈 Cross-Validation Statistics")
st.caption("Performance during hyperparameter search (averaged across CV folds)")
from entropice.dashboard.utils.stats import CVMetricStatistics
cv_stats = CVMetricStatistics.compute(selected_result, selected_metric)
col1, col2, col3, col4, col5 = st.columns(5)
with col1:
st.metric("Best Score", f"{cv_stats.best_score:.4f}")
with col2:
st.metric("Mean Score", f"{cv_stats.mean_score:.4f}")
with col3:
st.metric("Std Dev", f"{cv_stats.std_score:.4f}")
with col4:
st.metric("Worst Score", f"{cv_stats.worst_score:.4f}")
with col5:
st.metric("Median Score", f"{cv_stats.median_score:.4f}")
if cv_stats.mean_cv_std is not None:
st.info(f"**Mean CV Std:** {cv_stats.mean_cv_std:.4f} - Average standard deviation across CV folds")
# Compare with test metric if available
if selected_metric in selected_result.metrics:
test_score = selected_result.metrics[selected_metric]
st.divider() st.divider()
st.subheader("CV vs Test Performance")
col1, col2, col3 = st.columns(3) # Metric selection for detailed analysis
with col1: st.subheader("Analysis Settings")
st.metric("Best CV Score", f"{cv_stats.best_score:.4f}")
with col2:
st.metric("Test Score", f"{test_score:.4f}")
with col3:
delta = test_score - cv_stats.best_score
delta_pct = (delta / cv_stats.best_score * 100) if cv_stats.best_score != 0 else 0
st.metric("Difference", f"{delta:+.4f}", delta=f"{delta_pct:+.2f}%")
if abs(delta) > cv_stats.std_score: available_metrics = selected_result.available_metrics
st.warning(
"⚠️ Test performance differs significantly from CV performance. "
"This may indicate overfitting or data distribution mismatch."
)
# Try to get refit metric from settings
@st.fragment if selected_result.settings.task == "binary":
def render_confusion_matrix_section(selected_result: TrainingResult, merged_predictions: gpd.GeoDataFrame | None): refit_metric = "f1"
"""Render confusion matrix visualization and analysis. elif selected_result.settings.task in ["count_regimes", "density_regimes"]:
refit_metric = "f1_weighted"
Args:
selected_result: The selected TrainingResult object.
merged_predictions: GeoDataFrame with predictions merged with true labels and split info.
"""
st.header("🎲 Confusion Matrix")
st.caption("Detailed breakdown of predictions")
# Add selector for confusion matrix type
cm_type = st.selectbox(
"Select Data Split",
options=["test", "train", "all"],
format_func=lambda x: {"test": "Test Set", "train": "CV Set (Train Split)", "all": "All Available Data"}[x],
help="Choose which data split to display the confusion matrix for",
key="cm_split_select",
)
# Get label names from settings
label_names = selected_result.settings.classes
# Compute or load confusion matrix based on selection
if cm_type == "test":
if selected_result.confusion_matrix is None:
st.warning("No confusion matrix available for the test set.")
return
cm = selected_result.confusion_matrix
st.info("📊 Showing confusion matrix for the **Test Set** (held-out data, never used during training)")
else:
if merged_predictions is None:
st.warning("Predictions data not available. Cannot compute confusion matrix.")
return
with st.spinner(f"Computing confusion matrix for {cm_type} split..."):
cm = compute_confusion_matrix_from_merged_data(merged_predictions, cm_type, label_names)
if cm is None:
return
if cm_type == "train":
st.info(
"📊 Showing confusion matrix for the **CV Set (Train Split)** "
"(data used during hyperparameter search cross-validation)"
)
else: # all
st.info("📊 Showing confusion matrix for **All Available Data** (combined train and test splits)")
render_confusion_matrix_heatmap(cm, selected_result.settings.task)
def render_parameter_space_section(selected_result, selected_metric):
"""Render parameter space analysis section.
Args:
selected_result: The selected TrainingResult object.
selected_metric: The metric to analyze parameters against.
"""
st.header("🔍 Parameter Space Analysis")
# Compute CV results statistics
cv_results_stats = CVResultsStatistics.compute(selected_result)
# Show parameter space summary
with st.expander("📋 Parameter Space Summary", expanded=False):
param_summary_df = cv_results_stats.parameters_to_dataframe()
if not param_summary_df.empty:
st.dataframe(param_summary_df, hide_index=True, width="stretch")
else: else:
st.info("No parameter information available.") refit_metric = "r2"
results = selected_result.results if refit_metric in available_metrics:
settings = selected_result.settings default_metric_idx = available_metrics.index(refit_metric)
else:
default_metric_idx = 0
# Parameter distributions selected_metric = st.selectbox(
st.subheader("📈 Parameter Distributions") "Primary Metric for Analysis",
render_parameter_distributions(results, settings) options=available_metrics,
index=default_metric_idx,
format_func=format_metric_name,
help="Select the metric to focus on for detailed analysis",
key="metric_select",
)
# Binned parameter space plots # Form submit button
st.subheader("🎨 Binned Parameter Space") submitted = st.form_submit_button(
"Load Training Result",
type="primary",
use_container_width=True,
)
# Check if this is an ESPA model and show ESPA-specific plots if not submitted:
model_type = settings.model st.info("👆 Click 'Load Training Result' to apply changes.")
if model_type == "espa": st.stop()
# Show ESPA-specific binned plots (eps_cl vs eps_e binned by K)
render_espa_binned_parameter_space(results, selected_metric)
# Optionally show the generic binned plots in an expander return selected_result, selected_metric, refit_metric
with st.expander("📊 All Parameter Combinations", expanded=False):
st.caption("Generic parameter space exploration (all pairwise combinations)")
render_binned_parameter_space(results, selected_metric)
else:
# For non-ESPA models, show the generic binned plots
render_binned_parameter_space(results, selected_metric)
def render_data_export_section(results, selected_result):
    """Offer downloads of the CV results and run settings, plus a raw preview.

    Args:
        results: DataFrame with CV results.
        selected_result: The selected TrainingResult object.
    """
    with st.expander("💾 Export Data", expanded=False):
        import json

        st.subheader("Download Results")
        run_name = selected_result.path.name
        settings = selected_result.settings
        left, right = st.columns(2)
        with left:
            # Full hyperparameter-search results as CSV.
            st.download_button(
                label="📥 Download Full Results (CSV)",
                data=results.to_csv(index=False),
                file_name=f"{run_name}_results.csv",
                mime="text/csv",
            )
        with right:
            # Key run settings serialized as JSON.
            exported = {
                field: getattr(settings, field)
                for field in ("task", "grid", "level", "model", "cv_splits", "classes")
            }
            st.download_button(
                label="⚙️ Download Settings (JSON)",
                data=json.dumps(exported, indent=2),
                file_name=f"{run_name}_settings.json",
                mime="application/json",
            )
        # Show raw data preview
        st.subheader("Raw Data Preview")
        st.dataframe(results.head(100), width="stretch")
def render_training_analysis_page(): def render_training_analysis_page():
@ -513,91 +97,47 @@ def render_training_analysis_page():
""" """
) )
# Load all available training results # Load training results
training_results = load_all_training_results() training_results = load_all_training_results()
if not training_results: if not training_results:
st.warning("No training results found. Please run some training experiments first.") st.warning("No training results found. Please run some training experiments first.")
st.info("Run training using: `pixi run python -m entropice.ml.training`") st.stop()
return return
st.success(f"Found **{len(training_results)}** training result(s)") st.write(f"Found **{len(training_results)}** training result(s)")
st.divider() st.divider()
selected_result, selected_metric, refit_metric = render_analysis_settings_sidebar(training_results)
# Sidebar: Training run selection cv_statistics = CVMetricStatistics.compute(selected_result, selected_metric)
with st.sidebar:
selection_result = render_analysis_settings_sidebar(training_results)
if selection_result[0] is None:
return
selected_result, selected_metric, refit_metric, top_n = selection_result
# Load predictions with labels once (used by confusion matrix and map)
merged_predictions = load_predictions_with_labels(selected_result)
# Main content area
results = selected_result.results
settings = selected_result.settings
# Run Information
render_run_information(selected_result, refit_metric) render_run_information(selected_result, refit_metric)
st.divider() st.divider()
# Test Metrics Section render_metrics_section(selected_result)
render_test_metrics_section(selected_result)
st.divider() st.divider()
# Confusion Matrix Section # Render confusion matrices for classification, regression analysis for regression
render_confusion_matrix_section(selected_result, merged_predictions) if selected_result.settings.task in ["binary", "count_regimes", "density_regimes"]:
render_confusion_matrices(selected_result)
else:
render_regression_analysis(selected_result)
st.divider() st.divider()
# Performance Summary Section render_cv_statistics_section(cv_statistics, selected_result.test_metrics.get(selected_metric, float("nan")))
st.header("📊 CV Performance Overview")
st.caption("Summary of hyperparameter search results across all configurations")
render_performance_summary(results, refit_metric)
st.divider() st.divider()
# Prediction Analysis Map Section render_hparam_space_section(selected_result, selected_metric)
st.header("🗺️ Model Performance Map")
st.caption("Interactive 3D map showing prediction correctness across the training dataset")
render_confusion_matrix_map(selected_result.path, settings, merged_predictions)
st.divider() st.divider()
# Cross-Validation Statistics # List all results at the end
render_cv_statistics_section(selected_result, selected_metric) st.header("📄 All Training Results")
st.dataframe(selected_result.results)
st.divider()
# Parameter Space Analysis
render_parameter_space_section(selected_result, selected_metric)
st.divider()
# Parameter Correlation
st.header("🔗 Parameter Correlation")
render_parameter_correlation(results, selected_metric)
st.divider()
# Multi-Metric Comparison
if len(selected_result.available_metrics) >= 2:
st.header("📊 Multi-Metric Comparison")
render_multi_metric_comparison(results)
st.divider()
# Top Configurations
st.header("🏆 Top Performing Configurations")
render_top_configurations(results, selected_metric, top_n)
st.divider()
# Raw Data Export
render_data_export_section(results, selected_result)
st.balloons() st.balloons()
stopwatch.summary()

View file

@ -44,8 +44,8 @@ class AutoGluonSettings:
class AutoGluonTrainingSettings(DatasetEnsemble, AutoGluonSettings): class AutoGluonTrainingSettings(DatasetEnsemble, AutoGluonSettings):
"""Combined settings for AutoGluon training.""" """Combined settings for AutoGluon training."""
classes: list[str] | None classes: list[str] | None = None
problem_type: str problem_type: str = "binary"
def _determine_problem_type_and_metric(task: Task) -> tuple[str, str]: def _determine_problem_type_and_metric(task: Task) -> tuple[str, str]:
@ -177,6 +177,8 @@ def autogluon_train(
toml.dump({"settings": asdict(combined_settings)}, f) toml.dump({"settings": asdict(combined_settings)}, f)
# Save test metrics # Save test metrics
# We need to use pickle here, because the confusion matrix is stored as a dataframe
# This only matters for classification tasks
test_metrics_file = results_dir / "test_metrics.pickle" test_metrics_file = results_dir / "test_metrics.pickle"
print(f"💾 Saving test metrics to {test_metrics_file}") print(f"💾 Saving test metrics to {test_metrics_file}")
with open(test_metrics_file, "wb") as f: with open(test_metrics_file, "wb") as f: