Redo Training Results Analysis
This commit is contained in:
parent
2664579a75
commit
7d874f7f92
16 changed files with 1455 additions and 2227 deletions
|
|
@ -12,7 +12,6 @@ Pages:
|
|||
|
||||
import streamlit as st
|
||||
|
||||
from entropice.dashboard.views.autogluon_analysis_page import render_autogluon_analysis_page
|
||||
from entropice.dashboard.views.dataset_page import render_dataset_page
|
||||
from entropice.dashboard.views.inference_page import render_inference_page
|
||||
from entropice.dashboard.views.model_state_page import render_model_state_page
|
||||
|
|
@ -28,7 +27,6 @@ def main():
|
|||
overview_page = st.Page(render_overview_page, title="Overview", icon="🏡", default=True)
|
||||
data_page = st.Page(render_dataset_page, title="Dataset", icon="📊")
|
||||
training_analysis_page = st.Page(render_training_analysis_page, title="Training Results Analysis", icon="🦾")
|
||||
autogluon_page = st.Page(render_autogluon_analysis_page, title="AutoGluon Analysis", icon="🤖")
|
||||
model_state_page = st.Page(render_model_state_page, title="Model State", icon="🧮")
|
||||
inference_page = st.Page(render_inference_page, title="Inference", icon="🗺️")
|
||||
|
||||
|
|
@ -36,7 +34,7 @@ def main():
|
|||
{
|
||||
"Overview": [overview_page],
|
||||
"Data": [data_page],
|
||||
"Experiments": [training_analysis_page, autogluon_page, model_state_page],
|
||||
"Experiments": [training_analysis_page, model_state_page],
|
||||
"Inference": [inference_page],
|
||||
}
|
||||
)
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
417
src/entropice/dashboard/plots/hyperparameter_space.py
Normal file
417
src/entropice/dashboard/plots/hyperparameter_space.py
Normal file
|
|
@ -0,0 +1,417 @@
|
|||
"""Hyperparameter space plotting functions."""
|
||||
|
||||
import matplotlib.colors as mcolors
|
||||
import pandas as pd
|
||||
import plotly.graph_objects as go
|
||||
|
||||
from entropice.dashboard.utils.colors import get_cmap, get_palette
|
||||
|
||||
|
||||
def plot_performance_summary(results: pd.DataFrame, refit_metric: str) -> tuple[pd.DataFrame, pd.DataFrame, dict]:
    """Compute performance summary statistics.

    Args:
        results: DataFrame with CV results.
        refit_metric: The metric used for refit (e.g., 'f1', 'f1_weighted').

    Returns:
        Tuple of (best_scores_df, score_stats_df, best_params_dict).

    """
    # All aggregated test-score columns produced by the CV search.
    score_cols = [c for c in results.columns if c.startswith("mean_test_")]
    if not score_cols:
        # Without any score columns there is nothing to summarise.
        return pd.DataFrame(), pd.DataFrame(), {}

    def pretty(col: str) -> str:
        # "mean_test_f1_weighted" -> "F1 Weighted"
        return col.removeprefix("mean_test_").replace("_", " ").title()

    # Best score per metric, formatted for display.
    best_scores = [{"Metric": pretty(c), "Best Score": f"{results[c].max():.4f}"} for c in score_cols]

    # Mean/std per metric across all trials.
    score_stats = [
        {"Metric": pretty(c), "Mean ± Std": f"{results[c].mean():.4f} ± {results[c].std():.4f}"}
        for c in score_cols
    ]

    # Identify the best trial by the refit metric; fall back to the first
    # score column when the refit metric is absent.
    refit_col = f"mean_test_{refit_metric}"
    if refit_col not in results.columns:
        refit_col = score_cols[0]
    best_row = results.loc[results[refit_col].idxmax()]

    # Parameter values of the winning trial (prefix stripped for display).
    best_params = {
        c.removeprefix("param_"): best_row[c]
        for c in results.columns
        if c.startswith("param_") and c != "params"
    }

    return pd.DataFrame(best_scores), pd.DataFrame(score_stats), best_params
|
||||
|
||||
|
||||
def plot_parameter_distributions(results: pd.DataFrame, param_grid: dict | None = None) -> dict[str, go.Figure]:
    """Create histogram charts for parameter distributions.

    Args:
        results: DataFrame with CV results.
        param_grid: Optional parameter grid with distribution information.
            NOTE(review): currently unused; kept for interface symmetry with
            the other hyperparameter-space plots.

    Returns:
        Dictionary mapping parameter names to Plotly figures.

    """
    param_cols = [col for col in results.columns if col.startswith("param_") and col != "params"]
    if not param_cols:
        return {}

    # One mid-scale colour shared by every histogram/bar.
    cmap = get_cmap("parameter_distribution")
    bar_color = mcolors.rgb2hex(cmap(0.5))

    charts: dict[str, go.Figure] = {}
    for param_col in param_cols:
        param_name = param_col.replace("param_", "")
        param_values = results[param_col].dropna()

        if len(param_values) == 0:
            continue

        fig = go.Figure()
        if pd.api.types.is_numeric_dtype(param_values):
            # Numeric parameters: binned histogram.
            fig.add_trace(
                go.Histogram(
                    x=param_values,
                    nbinsx=30,
                    marker_color=bar_color,
                    name=param_name,
                )
            )
        else:
            # Categorical parameters: one bar per observed value.
            value_counts = param_values.value_counts().reset_index()
            value_counts.columns = [param_name, "count"]
            fig.add_trace(
                go.Bar(
                    x=value_counts[param_name],
                    y=value_counts["count"],
                    marker_color=bar_color,
                    name=param_name,
                )
            )

        # The layout was duplicated verbatim in both branches; apply it once.
        fig.update_layout(
            title=f"Distribution of {param_name}",
            xaxis_title=param_name,
            yaxis_title="Count",
            height=400,
            showlegend=False,
        )

        charts[param_name] = fig

    return charts
|
||||
|
||||
|
||||
def plot_score_vs_parameters(
    results: pd.DataFrame, metric: str, param_grid: dict | None = None
) -> dict[str, go.Figure]:
    """Create scatter plots of score vs each parameter.

    Args:
        results: DataFrame with CV results.
        metric: The metric to plot (e.g., 'f1', 'accuracy').
        param_grid: Optional parameter grid with distribution information.

    Returns:
        Dictionary mapping parameter names to Plotly figures.

    """
    score_col = f"mean_test_{metric}"
    if score_col not in results.columns:
        return {}

    param_cols = [c for c in results.columns if c.startswith("param_") and c != "params"]
    if not param_cols:
        return {}

    # Shared colorscale and axis label for every figure.
    hex_colors = get_palette(metric, n_colors=256)
    colorscale = [[idx / 255, color] for idx, color in enumerate(hex_colors)]
    metric_label = metric.replace("_", " ").title()

    figures: dict[str, go.Figure] = {}
    for col in param_cols:
        name = col.replace("param_", "")
        if results[col].dropna().empty:
            continue

        # Use a log x-axis when the grid sampled this parameter log-uniformly.
        config = (param_grid or {}).get(name)
        log_axis = isinstance(config, dict) and config.get("distribution") == "loguniform"

        scatter = go.Scatter(
            x=results[col],
            y=results[score_col],
            mode="markers",
            marker={
                "size": 8,
                "color": results[score_col],
                "colorscale": colorscale,
                "showscale": False,
                "opacity": 0.6,
            },
            text=[
                f"{name}: {val}<br>Score: {score:.4f}"
                for val, score in zip(results[col], results[score_col])
            ],
            hovertemplate="%{text}<extra></extra>",
        )

        fig = go.Figure(data=scatter)
        fig.update_layout(
            title=f"{metric_label} vs {name}",
            xaxis_title=name,
            xaxis_type="log" if log_axis else "linear",
            yaxis_title=metric_label,
            height=400,
            showlegend=False,
        )
        figures[name] = fig

    return figures
|
||||
|
||||
|
||||
def plot_parameter_correlations(results: pd.DataFrame, metric: str) -> go.Figure | None:
    """Create correlation bar chart between parameters and score.

    Args:
        results: DataFrame with CV results.
        metric: The metric to analyze (e.g., 'f1', 'accuracy').

    Returns:
        Plotly figure or None if no numeric parameters found.

    """
    score_col = f"mean_test_{metric}"
    if score_col not in results.columns:
        return None

    # Only numeric parameters have a meaningful Pearson correlation.
    numeric_params = [
        c
        for c in results.columns
        if c.startswith("param_") and c != "params" and pd.api.types.is_numeric_dtype(results[c])
    ]
    if not numeric_params:
        return None

    # Correlation of each numeric parameter against the score column.
    rows = [
        {
            "Parameter": c.replace("param_", ""),
            "Correlation": results[[c, score_col]].corr().iloc[0, 1],
        }
        for c in numeric_params
    ]
    corr_df = pd.DataFrame(rows).sort_values("Correlation", ascending=False)

    # Diverging palette suits signed correlations.
    hex_colors = get_palette("correlation", n_colors=256)

    bar = go.Bar(
        x=corr_df["Correlation"],
        y=corr_df["Parameter"],
        orientation="h",
        marker={
            "color": corr_df["Correlation"],
            "colorscale": [[idx / 255, color] for idx, color in enumerate(hex_colors)],
            "cmin": -1,
            "cmax": 1,
            "showscale": False,
        },
        text=[f"{c:.3f}" for c in corr_df["Correlation"]],
        hovertemplate="%{y}<br>Correlation: %{x:.3f}<extra></extra>",
    )

    fig = go.Figure(data=bar)
    fig.update_layout(
        xaxis_title="Correlation with Score",
        yaxis_title="Parameter",
        # Grow the chart with the number of parameters, never below 300px.
        height=max(300, len(rows) * 30),
        showlegend=False,
    )

    return fig
|
||||
|
||||
|
||||
def plot_parameter_interactions(results: pd.DataFrame, metric: str, param_grid: dict | None = None) -> list[go.Figure]:
    """Create scatter plots showing parameter interactions.

    Args:
        results: DataFrame with CV results.
        metric: The metric to visualize (e.g., 'f1', 'accuracy').
        param_grid: Optional parameter grid with distribution information.

    Returns:
        List of Plotly figures, one per unordered pair of numeric parameters.

    """
    score_col = f"mean_test_{metric}"
    if score_col not in results.columns:
        return []

    # Only numeric parameters can live on continuous axes.
    param_cols = [col for col in results.columns if col.startswith("param_") and col != "params"]
    numeric_params = [col for col in param_cols if pd.api.types.is_numeric_dtype(results[col])]

    if len(numeric_params) < 2:
        return []

    # Hoist loop invariants: the colorscale and metric label are identical
    # for every pair. (Also removes the previous shadowing of the outer
    # loop variable `i` inside the colorscale comprehension.)
    hex_colors = get_palette(metric, n_colors=256)
    colorscale = [[j / 255, c] for j, c in enumerate(hex_colors)]
    metric_label = metric.replace("_", " ").title()

    def _uses_log_scale(param_name: str) -> bool:
        # A parameter sampled log-uniformly gets a log axis.
        config = (param_grid or {}).get(param_name)
        return isinstance(config, dict) and config.get("distribution") == "loguniform"

    charts = []
    param_names = [col.replace("param_", "") for col in numeric_params]

    for idx, x_param in enumerate(param_names[:-1]):
        for y_param in param_names[idx + 1 :]:
            x_col = f"param_{x_param}"
            y_col = f"param_{y_param}"

            fig = go.Figure()
            fig.add_trace(
                go.Scatter(
                    x=results[x_col],
                    y=results[y_col],
                    mode="markers",
                    marker={
                        "size": 8,
                        "color": results[score_col],
                        "colorscale": colorscale,
                        "showscale": True,
                        "colorbar": {"title": metric_label},
                        "opacity": 0.7,
                    },
                    text=[
                        f"{x_param}: {x_val}<br>{y_param}: {y_val}<br>Score: {score:.4f}"
                        for x_val, y_val, score in zip(results[x_col], results[y_col], results[score_col])
                    ],
                    hovertemplate="%{text}<extra></extra>",
                )
            )
            fig.update_layout(
                title=f"{metric_label} by {x_param} and {y_param}",
                xaxis_title=x_param,
                xaxis_type="log" if _uses_log_scale(x_param) else "linear",
                yaxis_title=y_param,
                yaxis_type="log" if _uses_log_scale(y_param) else "linear",
                height=500,
                width=500,
            )

            charts.append(fig)

    return charts
|
||||
|
||||
|
||||
def plot_score_evolution(results: pd.DataFrame, metric: str) -> go.Figure | None:
    """Create line chart showing score evolution over iterations.

    Args:
        results: DataFrame with CV results.
        metric: The metric to visualize (e.g., 'f1', 'accuracy').

    Returns:
        Plotly figure or None if metric not found.

    """
    score_col = f"mean_test_{metric}"
    if score_col not in results.columns:
        return None

    # Row order is treated as search iteration order.
    xs = list(range(len(results)))
    raw_scores = results[score_col].to_numpy()
    running_best = results[score_col].cummax().to_numpy()

    # Two hues from the same colormap: faint for per-trial scores,
    # stronger for the running best.
    cmap = get_cmap("score_evolution")
    raw_color = mcolors.rgb2hex(cmap(0.3))
    best_color = mcolors.rgb2hex(cmap(0.7))

    fig = go.Figure(
        data=[
            go.Scatter(
                x=xs,
                y=raw_scores,
                mode="lines",
                name="Score",
                line={"color": raw_color, "width": 1},
                opacity=0.6,
                hovertemplate="Iteration: %{x}<br>Score: %{y:.4f}<extra></extra>",
            ),
            go.Scatter(
                x=xs,
                y=running_best,
                mode="lines",
                name="Best So Far",
                line={"color": best_color, "width": 2},
                hovertemplate="Iteration: %{x}<br>Best So Far: %{y:.4f}<extra></extra>",
            ),
        ]
    )

    fig.update_layout(
        title=f"{metric.replace('_', ' ').title()} Evolution",
        xaxis_title="Iteration",
        yaxis_title=metric.replace("_", " ").title(),
        height=300,
        hovermode="x unified",
    )

    return fig
|
||||
97
src/entropice/dashboard/plots/metrics.py
Normal file
97
src/entropice/dashboard/plots/metrics.py
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
"""Metrics visualization plots."""
|
||||
|
||||
import numpy as np
|
||||
import plotly.graph_objects as go
|
||||
import xarray as xr
|
||||
|
||||
|
||||
def plot_confusion_matrix(cm_data: xr.DataArray, title: str = "Confusion Matrix", normalize: str = "none") -> go.Figure:
    """Plot an interactive confusion matrix heatmap.

    Args:
        cm_data: XArray DataArray with confusion matrix data (dimensions: true_label, predicted_label).
        title: Title for the plot.
        normalize: Normalization mode - "none", "true", or "pred".

    Returns:
        Plotly figure with the interactive confusion matrix heatmap.

    """
    cm_array = cm_data.values.astype(float)
    labels = cm_data.coords["true_label"].values.tolist()

    # Keep the raw counts for hover text and annotations.
    cm_counts = cm_data.values

    # Apply normalization.
    if normalize == "true":
        # Normalize over true labels (rows) - each row sums to 1.
        sums = cm_array.sum(axis=1, keepdims=True)
        # BUG FIX: np.divide(..., where=...) without `out=` leaves the masked
        # entries uninitialized; a zeroed output buffer makes all-zero
        # rows/columns render as 0 instead of garbage.
        cm_normalized = np.divide(cm_array, sums, out=np.zeros_like(cm_array), where=sums != 0)
        colorbar_title = "Proportion"
    elif normalize == "pred":
        # Normalize over predicted labels (columns) - each column sums to 1.
        sums = cm_array.sum(axis=0, keepdims=True)
        cm_normalized = np.divide(cm_array, sums, out=np.zeros_like(cm_array), where=sums != 0)
        colorbar_title = "Proportion"
    else:
        # No normalization.
        cm_normalized = cm_array
        colorbar_title = "Count"

    # Hoisted loop invariants: grand total and the text-colour threshold
    # were previously recomputed for every cell.
    total = cm_counts.sum()
    threshold = cm_normalized.max() / 2 if cm_normalized.max() > 0 else 0.5

    # Create annotations for the heatmap.
    annotations = []
    for i, true_label in enumerate(labels):
        for j, pred_label in enumerate(labels):
            count = int(cm_counts[i, j])
            normalized_val = cm_normalized[i, j]

            if normalize == "none":
                # Show count and percentage of total.
                pct = (count / total * 100) if total > 0 else 0
                text = f"{count}<br>({pct:.1f}%)"
            else:
                # Show percentage only for normalized versions.
                text = f"{normalized_val:.1%}"

            # Light text on dark cells, dark text on light cells.
            text_color = "white" if normalized_val > threshold else "black"

            annotations.append(
                {
                    "x": pred_label,
                    "y": true_label,
                    "text": text,
                    "showarrow": False,
                    "font": {"size": 10, "color": text_color},
                }
            )

    # Create the heatmap with normalized values for coloring.
    fig = go.Figure(
        data=go.Heatmap(
            z=cm_normalized,
            x=labels,
            y=labels,
            colorscale="Blues",
            colorbar={"title": colorbar_title},
            hoverongaps=False,
            hovertemplate="True: %{y}<br>Predicted: %{x}<br>Count: %{customdata}<extra></extra>",
            customdata=cm_counts,
        )
    )

    fig.update_layout(
        # BUG FIX: the `title` parameter was accepted but never applied.
        title=title,
        annotations=annotations,
        xaxis={"title": "Predicted Label", "side": "bottom"},
        yaxis={"title": "True Label", "autorange": "reversed"},
        width=600,
        height=550,
    )

    return fig
|
||||
180
src/entropice/dashboard/plots/regression.py
Normal file
180
src/entropice/dashboard/plots/regression.py
Normal file
|
|
@ -0,0 +1,180 @@
|
|||
"""Regression analysis plotting functions."""
|
||||
|
||||
from typing import cast
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import plotly.graph_objects as go
|
||||
|
||||
from entropice.dashboard.utils.colors import get_palette
|
||||
|
||||
|
||||
def plot_regression_scatter(
    y_true: np.ndarray | pd.Series,
    y_pred: np.ndarray | pd.Series,
    title: str = "True vs Predicted",
) -> go.Figure:
    """Create scatter plot of true vs predicted values for regression.

    Args:
        y_true: True target values.
        y_pred: Predicted target values.
        title: Title for the plot.

    Returns:
        Plotly figure with regression scatter plot.

    """
    # Convert to numpy arrays if needed.
    y_true_np = cast(np.ndarray, y_true.to_numpy()) if isinstance(y_true, pd.Series) else y_true
    y_pred_np = cast(np.ndarray, y_pred.to_numpy()) if isinstance(y_pred, pd.Series) else y_pred

    # Error metrics shown in the annotation box.
    errors = y_true_np - y_pred_np
    mse = np.mean(errors**2)
    mae = np.mean(np.abs(errors))
    ss_res = np.sum(errors**2)
    ss_tot = np.sum((y_true_np - np.mean(y_true_np)) ** 2)
    # Guard against a constant y_true (ss_tot == 0), which previously
    # produced a divide-by-zero warning and an inf/nan R².
    r2 = 1 - ss_res / ss_tot if ss_tot > 0 else float("nan")

    # Get colormap.
    hex_colors = get_palette("r2", n_colors=256)

    # Calculate point density for coloring (lazy import keeps scipy optional
    # until this plot is actually rendered).
    from scipy.stats import gaussian_kde

    try:
        xy = np.vstack([y_true_np, y_pred_np])
        kde = gaussian_kde(xy)
        density = kde(xy)
    except (np.linalg.LinAlgError, ValueError):
        # Fallback if KDE fails (e.g., all points identical).
        density = np.ones(len(y_true_np))

    fig = go.Figure()

    # Data points, coloured by local density.
    fig.add_trace(
        go.Scatter(
            x=y_true_np,
            y=y_pred_np,
            mode="markers",
            marker={
                "size": 6,
                "color": density,
                "colorscale": [[i / 255, c] for i, c in enumerate(hex_colors)],
                "showscale": False,
                "opacity": 0.6,
            },
            text=[f"True: {true:.3f}<br>Pred: {pred:.3f}" for true, pred in zip(y_true_np, y_pred_np)],
            hovertemplate="%{text}<extra></extra>",
            name="Data",
        )
    )

    # Diagonal line (perfect prediction, y = x).
    min_val = min(y_true_np.min(), y_pred_np.min())
    max_val = max(y_true_np.max(), y_pred_np.max())
    fig.add_trace(
        go.Scatter(
            x=[min_val, max_val],
            y=[min_val, max_val],
            mode="lines",
            line={"color": "red", "dash": "dash", "width": 2},
            name="Perfect Prediction",
            hovertemplate="y = x<extra></extra>",
        )
    )

    # Metrics box pinned to the top-left corner of the plotting area.
    metrics_text = f"R² = {r2:.4f}<br>MSE = {mse:.4f}<br>MAE = {mae:.4f}"

    fig.add_annotation(
        x=0.02,
        y=0.98,
        xref="paper",
        yref="paper",
        text=metrics_text,
        showarrow=False,
        bgcolor="white",
        bordercolor="black",
        borderwidth=1,
        xanchor="left",
        yanchor="top",
        font={"size": 12},
    )

    fig.update_layout(
        title=title,
        xaxis_title="True Values",
        yaxis_title="Predicted Values",
        height=500,
        showlegend=True,
        legend={"x": 0.98, "y": 0.02, "xanchor": "right", "yanchor": "bottom"},
    )

    # Make axes equal so the diagonal renders at 45°.
    fig.update_xaxes(scaleanchor="y", scaleratio=1)

    return fig
|
||||
|
||||
|
||||
def plot_residuals(
    y_true: np.ndarray | pd.Series,
    y_pred: np.ndarray | pd.Series,
    title: str = "Residual Plot",
) -> go.Figure:
    """Create residual plot for regression diagnostics.

    Args:
        y_true: True target values.
        y_pred: Predicted target values.
        title: Title for the plot.

    Returns:
        Plotly figure with residual plot.

    """
    # Convert to numpy arrays if needed.
    y_true_np = cast(np.ndarray, y_true.to_numpy()) if isinstance(y_true, pd.Series) else y_true
    y_pred_np = cast(np.ndarray, y_pred.to_numpy()) if isinstance(y_pred, pd.Series) else y_pred

    # Residuals: positive means the model under-predicted.
    residuals = y_true_np - y_pred_np

    # Get colormap.
    hex_colors = get_palette("r2", n_colors=256)

    fig = go.Figure()

    # Scatter of residuals against predictions, coloured by error magnitude.
    # CONSISTENCY FIX: use the converted y_pred_np throughout (the original
    # mixed the raw y_pred — possibly a pandas Series — into the trace and
    # hover text).
    fig.add_trace(
        go.Scatter(
            x=y_pred_np,
            y=residuals,
            mode="markers",
            marker={
                "size": 6,
                "color": np.abs(residuals),
                "colorscale": [[i / 255, c] for i, c in enumerate(hex_colors)],
                "showscale": True,
                "colorbar": {"title": "Abs Residual"},
                "opacity": 0.6,
            },
            text=[f"Pred: {pred:.3f}<br>Residual: {res:.3f}" for pred, res in zip(y_pred_np, residuals)],
            hovertemplate="%{text}<extra></extra>",
        )
    )

    # Zero line: points on it are perfectly predicted.
    fig.add_hline(y=0, line_dash="dash", line_color="red", line_width=2)

    fig.update_layout(
        title=title,
        xaxis_title="Predicted Values",
        yaxis_title="Residuals (True - Predicted)",
        height=400,
        showlegend=False,
    )

    return fig
|
||||
185
src/entropice/dashboard/sections/cv_result.py
Normal file
185
src/entropice/dashboard/sections/cv_result.py
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
"""Training Result Sections."""
|
||||
|
||||
import streamlit as st
|
||||
|
||||
from entropice.dashboard.plots.metrics import plot_confusion_matrix
|
||||
from entropice.dashboard.utils.formatters import format_metric_name
|
||||
from entropice.dashboard.utils.loaders import TrainingResult
|
||||
from entropice.dashboard.utils.stats import CVMetricStatistics
|
||||
from entropice.utils.types import GridConfig
|
||||
|
||||
|
||||
def render_run_information(selected_result: TrainingResult, refit_metric):
    """Render training run configuration overview.

    Args:
        selected_result: The selected TrainingResult object.
        refit_metric: The refit metric used for model selection.

    """
    st.header("📋 Run Information")

    grid_config = GridConfig.from_grid_level(f"{selected_result.settings.grid}{selected_result.settings.level}")  # ty:ignore[invalid-argument-type]

    # Headline facts about the run, one metric per column.
    settings = selected_result.settings
    facts = (
        ("Task", settings.task.capitalize()),
        ("Target", settings.target.capitalize()),
        ("Grid", grid_config.display_name),
        ("Model", settings.model.upper()),
        ("Trials", len(selected_result.results)),
    )
    for column, (label, value) in zip(st.columns(len(facts)), facts):
        with column:
            st.metric(label, value)

    st.caption(f"**Refit Metric:** {format_metric_name(refit_metric)}")
|
||||
|
||||
|
||||
def _render_metrics(metrics: dict[str, float]):
    """Render metrics in a wrapped multi-column layout (up to five columns).

    Args:
        metrics: Dictionary of metric names and their values.

    """
    ncols = min(5, len(metrics))
    columns = st.columns(ncols)
    for idx, (name, value) in enumerate(metrics.items()):
        # Wrap around to the first column after filling a row.
        target = columns[idx % ncols]
        with target:
            st.metric(format_metric_name(name), f"{value:.4f}")
|
||||
|
||||
|
||||
def render_metrics_section(selected_result: TrainingResult):
    """Render test, train, and combined metrics for the best model.

    Args:
        selected_result: The selected TrainingResult object.

    """
    # Header / caption / metrics triples, rendered identically per split.
    sections = (
        (
            "🎯 Test Set Performance",
            "Performance metrics on the held-out test set (best model from hyperparameter search)",
            selected_result.test_metrics,
        ),
        (
            "🏋️♂️ Training Set Performance",
            "Performance metrics on the training set (best model from hyperparameter search)",
            selected_result.train_metrics,
        ),
        (
            "🧮 Overall Performance",
            "Overall performance metrics combining training and test sets",
            selected_result.combined_metrics,
        ),
    )
    for header, caption, metrics in sections:
        st.header(header)
        st.caption(caption)
        _render_metrics(metrics)
|
||||
|
||||
|
||||
@st.fragment
def render_confusion_matrices(selected_result: TrainingResult):
    """Render confusion matrices for classification tasks.

    Args:
        selected_result: The selected TrainingResult object.

    """
    st.header("🎭 Confusion Matrices")

    # Confusion matrices only make sense for classification tasks.
    if selected_result.settings.task not in ["binary", "count_regimes", "density_regimes"]:
        st.info(
            "📊 Confusion matrices are only available for classification tasks "
            "(binary, count_regimes, density_regimes)."
        )
        st.caption("Coming soon for regression tasks: residual plots and error distributions.")
        return

    # Check if confusion matrix data is available.
    if selected_result.confusion_matrix is None:
        st.warning("⚠️ No confusion matrix data found for this training result.")
        return

    cm = selected_result.confusion_matrix

    # Let the user pick how the matrices are normalized.
    st.subheader("Display Options")
    normalize_option = st.radio(
        "Normalization",
        options=["No normalization", "Normalize over True Labels", "Normalize over Predicted Labels"],
        horizontal=True,
        help="Choose how to normalize the confusion matrix values",
    )

    # Map selection to normalization mode.
    normalize_map = {
        "No normalization": "none",
        "Normalize over True Labels": "true",
        "Normalize over Predicted Labels": "pred",
    }
    normalize_mode = normalize_map[normalize_option]

    # One column per data split. The rendering logic is identical, so drive
    # it from a table instead of repeating the body three times.
    splits = [
        ("test", "Test Set", "Held-out test set"),
        ("train", "Training Set", "Training set"),
        ("combined", "Combined", "Train + Test sets"),
    ]
    for column, (key, subheader, caption) in zip(st.columns(len(splits)), splits):
        with column:
            st.subheader(subheader)
            st.caption(caption)
            fig = plot_confusion_matrix(cm[key], title=subheader, normalize=normalize_mode)
            st.plotly_chart(fig, width="stretch")
|
||||
|
||||
|
||||
def render_cv_statistics_section(cv_stats: CVMetricStatistics, test_score: float):
    """Render cross-validation statistics for selected metric.

    Args:
        cv_stats: CVMetricStatistics object containing cross-validation statistics.
        test_score: The test set score for the selected metric.

    """
    st.header("📈 Cross-Validation Statistics")
    st.caption("Performance during hyperparameter search (averaged across CV folds)")

    # Summary statistics, one metric per column.
    summary = (
        ("Best Score", cv_stats.best_score),
        ("Mean Score", cv_stats.mean_score),
        ("Std Dev", cv_stats.std_score),
        ("Worst Score", cv_stats.worst_score),
        ("Median Score", cv_stats.median_score),
    )
    for column, (label, value) in zip(st.columns(len(summary)), summary):
        with column:
            st.metric(label, f"{value:.4f}")

    if cv_stats.mean_cv_std is not None:
        st.info(f"**Mean CV Std:** {cv_stats.mean_cv_std:.4f} - Average standard deviation across CV folds")

    # Compare the best CV score against the held-out test score.
    st.subheader("CV vs Test Performance")

    left, middle, right = st.columns(3)
    with left:
        st.metric("Best CV Score", f"{cv_stats.best_score:.4f}")
    with middle:
        st.metric("Test Score", f"{test_score:.4f}")
    with right:
        delta = test_score - cv_stats.best_score
        # Avoid division by zero when the best CV score is exactly 0.
        delta_pct = (delta / cv_stats.best_score * 100) if cv_stats.best_score != 0 else 0
        st.metric("Difference", f"{delta:+.4f}", delta=f"{delta_pct:+.2f}%")

    # A gap larger than one CV standard deviation is worth flagging.
    if abs(delta) > cv_stats.std_score:
        st.warning(
            "⚠️ Test performance differs significantly (larger than the CV standard deviation) from CV performance. "
            "This may indicate overfitting or data distribution mismatch between training and test sets."
        )
|
||||
|
|
@ -2,15 +2,16 @@
|
|||
|
||||
from datetime import datetime
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
from entropice.dashboard.utils.loaders import TrainingResult
|
||||
from entropice.dashboard.utils.loaders import AutogluonTrainingResult, TrainingResult
|
||||
from entropice.utils.types import (
|
||||
GridConfig,
|
||||
)
|
||||
|
||||
|
||||
def render_training_results_summary(training_results: list[TrainingResult]):
|
||||
def render_training_results_summary(training_results: list[TrainingResult | AutogluonTrainingResult]):
|
||||
"""Render summary metrics for training results."""
|
||||
st.header("📊 Training Results Summary")
|
||||
col1, col2, col3, col4 = st.columns(4)
|
||||
|
|
@ -23,7 +24,7 @@ def render_training_results_summary(training_results: list[TrainingResult]):
|
|||
st.metric("Total Runs", len(training_results))
|
||||
|
||||
with col3:
|
||||
models = {tr.settings.model for tr in training_results}
|
||||
models = {tr.settings.model for tr in training_results if hasattr(tr.settings, "model")}
|
||||
st.metric("Model Types", len(models))
|
||||
|
||||
with col4:
|
||||
|
|
@ -33,14 +34,14 @@ def render_training_results_summary(training_results: list[TrainingResult]):
|
|||
|
||||
|
||||
@st.fragment
|
||||
def render_experiment_results(training_results: list[TrainingResult]): # noqa: C901
|
||||
def render_experiment_results(training_results: list[TrainingResult | AutogluonTrainingResult]): # noqa: C901
|
||||
"""Render detailed experiment results table and expandable details."""
|
||||
st.header("🎯 Experiment Results")
|
||||
|
||||
# Filters
|
||||
experiments = sorted({tr.experiment for tr in training_results if tr.experiment})
|
||||
tasks = sorted({tr.settings.task for tr in training_results})
|
||||
models = sorted({tr.settings.model for tr in training_results})
|
||||
models = sorted({tr.settings.model if isinstance(tr, TrainingResult) else "autogluon" for tr in training_results})
|
||||
grids = sorted({f"{tr.settings.grid}-{tr.settings.level}" for tr in training_results})
|
||||
|
||||
# Create filter columns
|
||||
|
|
@ -87,14 +88,26 @@ def render_experiment_results(training_results: list[TrainingResult]): # noqa:
|
|||
filtered_results = [tr for tr in filtered_results if tr.experiment == selected_experiment]
|
||||
if selected_task != "All":
|
||||
filtered_results = [tr for tr in filtered_results if tr.settings.task == selected_task]
|
||||
if selected_model != "All":
|
||||
filtered_results = [tr for tr in filtered_results if tr.settings.model == selected_model]
|
||||
if selected_model != "All" and selected_model != "autogluon":
|
||||
filtered_results = [
|
||||
tr for tr in filtered_results if isinstance(tr, TrainingResult) and tr.settings.model == selected_model
|
||||
]
|
||||
elif selected_model == "autogluon":
|
||||
filtered_results = [tr for tr in filtered_results if isinstance(tr, AutogluonTrainingResult)]
|
||||
if selected_grid != "All":
|
||||
filtered_results = [tr for tr in filtered_results if f"{tr.settings.grid}-{tr.settings.level}" == selected_grid]
|
||||
|
||||
st.subheader("Results Table")
|
||||
|
||||
summary_df = TrainingResult.to_dataframe(filtered_results)
|
||||
summary_df = TrainingResult.to_dataframe([tr for tr in filtered_results if isinstance(tr, TrainingResult)])
|
||||
autogluon_df = AutogluonTrainingResult.to_dataframe(
|
||||
[tr for tr in filtered_results if isinstance(tr, AutogluonTrainingResult)]
|
||||
)
|
||||
if len(summary_df) == 0:
|
||||
summary_df = autogluon_df
|
||||
elif len(autogluon_df) > 0:
|
||||
summary_df = pd.concat([summary_df, autogluon_df], ignore_index=True)
|
||||
|
||||
# Display with color coding for best scores
|
||||
st.dataframe(
|
||||
summary_df,
|
||||
|
|
@ -107,6 +120,8 @@ def render_experiment_results(training_results: list[TrainingResult]): # noqa:
|
|||
for tr in filtered_results:
|
||||
tr_info = tr.display_info
|
||||
display_name = tr_info.get_display_name("model_first")
|
||||
model = "autogluon" if isinstance(tr, AutogluonTrainingResult) else tr.settings.model
|
||||
cv_splits = tr.settings.cv_splits if hasattr(tr.settings, "cv_splits") else "N/A"
|
||||
with st.expander(display_name):
|
||||
col1, col2 = st.columns([1, 2])
|
||||
|
||||
|
|
@ -117,12 +132,12 @@ def render_experiment_results(training_results: list[TrainingResult]): # noqa:
|
|||
f"- **Experiment:** {tr.experiment}\n"
|
||||
f"- **Task:** {tr.settings.task}\n"
|
||||
f"- **Target:** {tr.settings.target}\n"
|
||||
f"- **Model:** {tr.settings.model}\n"
|
||||
f"- **Model:** {model}\n"
|
||||
f"- **Grid:** {grid_config.display_name}\n"
|
||||
f"- **Created At:** {tr_info.timestamp.strftime('%Y-%m-%d %H:%M')}\n"
|
||||
f"- **Temporal Mode:** {tr.settings.temporal_mode}\n"
|
||||
f"- **Members:** {', '.join(tr.settings.members)}\n"
|
||||
f"- **CV Splits:** {tr.settings.cv_splits}\n"
|
||||
f"- **CV Splits:** {cv_splits}\n"
|
||||
f"- **Classes:** {tr.settings.classes}\n"
|
||||
)
|
||||
|
||||
|
|
@ -140,8 +155,11 @@ def render_experiment_results(training_results: list[TrainingResult]): # noqa:
|
|||
file_str += f"- 📄 `{file.name}`\n"
|
||||
st.write(file_str)
|
||||
with col2:
|
||||
if isinstance(tr, AutogluonTrainingResult):
|
||||
st.write("**Leaderboard:**")
|
||||
st.dataframe(tr.leaderboard, width="stretch", hide_index=True)
|
||||
else:
|
||||
st.write("**CV Score Summary:**")
|
||||
|
||||
# Extract all test scores
|
||||
metric_df = tr.get_metric_dataframe()
|
||||
if metric_df is not None:
|
||||
|
|
|
|||
172
src/entropice/dashboard/sections/hparam_space.py
Normal file
172
src/entropice/dashboard/sections/hparam_space.py
Normal file
|
|
@ -0,0 +1,172 @@
|
|||
"""Hyperparameter Space Visualization Section."""
|
||||
|
||||
import streamlit as st
|
||||
|
||||
from entropice.dashboard.plots.hyperparameter_space import (
|
||||
plot_parameter_correlations,
|
||||
plot_parameter_distributions,
|
||||
plot_parameter_interactions,
|
||||
plot_score_evolution,
|
||||
plot_score_vs_parameters,
|
||||
)
|
||||
from entropice.dashboard.utils.formatters import format_metric_name
|
||||
from entropice.dashboard.utils.loaders import TrainingResult
|
||||
|
||||
|
||||
def _render_performance_summary(results, refit_metric: str):
|
||||
"""Render performance summary subsection."""
|
||||
best_idx = results[f"mean_test_{refit_metric}"].idxmax()
|
||||
best_row = results.loc[best_idx]
|
||||
# Extract parameter columns
|
||||
param_cols = [col for col in results.columns if col.startswith("param_") and col != "params"]
|
||||
best_params = {col.replace("param_", ""): best_row[col] for col in param_cols}
|
||||
|
||||
# Display best parameter combination
|
||||
if not best_params:
|
||||
return
|
||||
|
||||
with st.container(border=True):
|
||||
st.subheader("🏆 Best Parameter Combination")
|
||||
st.caption(f"Parameters of the best model (selected by {format_metric_name(refit_metric)} score)")
|
||||
n_params = len(best_params)
|
||||
cols = st.columns(n_params)
|
||||
for idx, (param_name, param_value) in enumerate(best_params.items()):
|
||||
with cols[idx]:
|
||||
# Format value based on type and magnitude
|
||||
if isinstance(param_value, int):
|
||||
formatted_value = f"{param_value:.0f}"
|
||||
elif isinstance(param_value, float):
|
||||
# Use scientific notation for very small numbers
|
||||
if abs(param_value) < 0.001 and param_value != 0:
|
||||
formatted_value = f"{param_value:.2e}"
|
||||
else:
|
||||
formatted_value = f"{param_value:.4f}"
|
||||
else:
|
||||
formatted_value = str(param_value)
|
||||
|
||||
st.metric(param_name, formatted_value)
|
||||
|
||||
|
||||
def _render_parameter_distributions(results, param_grid: dict | None):
|
||||
"""Render parameter distributions subsection."""
|
||||
st.subheader("Parameter Distributions")
|
||||
st.caption("Distribution of hyperparameter values explored during random search")
|
||||
|
||||
param_charts = plot_parameter_distributions(results, param_grid)
|
||||
|
||||
if not param_charts:
|
||||
st.info("No parameter distribution data available.")
|
||||
return
|
||||
|
||||
# Display charts in a grid
|
||||
param_names = list(param_charts.keys())
|
||||
n_cols = min(3, len(param_names))
|
||||
n_rows = (len(param_names) + n_cols - 1) // n_cols
|
||||
|
||||
for row in range(n_rows):
|
||||
cols = st.columns(n_cols)
|
||||
for col_idx in range(n_cols):
|
||||
param_idx = row * n_cols + col_idx
|
||||
if param_idx < len(param_names):
|
||||
param_name = param_names[param_idx]
|
||||
with cols[col_idx]:
|
||||
st.plotly_chart(param_charts[param_name], width="stretch")
|
||||
|
||||
|
||||
def _render_score_evolution(results, selected_metric: str):
|
||||
"""Render score evolution subsection."""
|
||||
st.subheader("Score Evolution Over Iterations")
|
||||
st.caption(f"How {format_metric_name(selected_metric)} evolved during the random search")
|
||||
|
||||
evolution_chart = plot_score_evolution(results, selected_metric)
|
||||
if evolution_chart:
|
||||
st.plotly_chart(evolution_chart, width="stretch")
|
||||
else:
|
||||
st.warning(f"Score evolution not available for metric: {selected_metric}")
|
||||
|
||||
|
||||
def _render_score_vs_parameters(results, selected_metric: str, param_grid: dict | None):
|
||||
"""Render score vs parameters subsection."""
|
||||
st.subheader("Score vs Individual Parameters")
|
||||
st.caption(f"Relationship between {format_metric_name(selected_metric)} and each hyperparameter")
|
||||
|
||||
score_vs_param_charts = plot_score_vs_parameters(results, selected_metric, param_grid)
|
||||
|
||||
if not score_vs_param_charts:
|
||||
st.info("No score vs parameter data available.")
|
||||
return
|
||||
|
||||
param_names = list(score_vs_param_charts.keys())
|
||||
n_cols = min(2, len(param_names))
|
||||
n_rows = (len(param_names) + n_cols - 1) // n_cols
|
||||
|
||||
for row in range(n_rows):
|
||||
cols = st.columns(n_cols)
|
||||
for col_idx in range(n_cols):
|
||||
param_idx = row * n_cols + col_idx
|
||||
if param_idx < len(param_names):
|
||||
param_name = param_names[param_idx]
|
||||
with cols[col_idx]:
|
||||
st.plotly_chart(score_vs_param_charts[param_name], width="stretch")
|
||||
|
||||
|
||||
def _render_parameter_correlations(results, selected_metric: str):
|
||||
"""Render parameter correlations subsection."""
|
||||
st.subheader("Parameter-Score Correlations")
|
||||
st.caption(f"Correlation between numeric parameters and {format_metric_name(selected_metric)}")
|
||||
|
||||
corr_chart = plot_parameter_correlations(results, selected_metric)
|
||||
if corr_chart:
|
||||
st.plotly_chart(corr_chart, width="stretch")
|
||||
else:
|
||||
st.info("No numeric parameters found for correlation analysis.")
|
||||
|
||||
|
||||
def _render_parameter_interactions(results, selected_metric: str, param_grid: dict | None):
|
||||
"""Render parameter interactions subsection."""
|
||||
st.subheader("Parameter Interactions")
|
||||
st.caption(f"Interaction between parameter pairs and their effect on {format_metric_name(selected_metric)}")
|
||||
|
||||
interaction_charts = plot_parameter_interactions(results, selected_metric, param_grid)
|
||||
|
||||
if not interaction_charts:
|
||||
st.info("Not enough numeric parameters for parameter interaction visualization.")
|
||||
return
|
||||
|
||||
n_cols = min(2, len(interaction_charts))
|
||||
n_rows = (len(interaction_charts) + n_cols - 1) // n_cols
|
||||
|
||||
for row in range(n_rows):
|
||||
cols = st.columns(n_cols)
|
||||
for col_idx in range(n_cols):
|
||||
chart_idx = row * n_cols + col_idx
|
||||
if chart_idx < len(interaction_charts):
|
||||
with cols[col_idx]:
|
||||
st.plotly_chart(interaction_charts[chart_idx], width="stretch")
|
||||
|
||||
|
||||
def render_hparam_space_section(selected_result: TrainingResult, selected_metric: str):
|
||||
"""Render the hyperparameter space visualization section.
|
||||
|
||||
Args:
|
||||
selected_result: The selected TrainingResult object.
|
||||
selected_metric: The metric to focus analysis on.
|
||||
|
||||
"""
|
||||
st.header("🧩 Hyperparameter Space Exploration")
|
||||
|
||||
results = selected_result.results
|
||||
refit_metric = selected_result._get_best_metric_name()
|
||||
param_grid = selected_result.settings.param_grid
|
||||
|
||||
_render_performance_summary(results, refit_metric)
|
||||
|
||||
_render_parameter_distributions(results, param_grid)
|
||||
|
||||
_render_score_evolution(results, selected_metric)
|
||||
|
||||
_render_score_vs_parameters(results, selected_metric, param_grid)
|
||||
|
||||
_render_parameter_correlations(results, selected_metric)
|
||||
|
||||
_render_parameter_interactions(results, selected_metric, param_grid)
|
||||
122
src/entropice/dashboard/sections/regression_analysis.py
Normal file
122
src/entropice/dashboard/sections/regression_analysis.py
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
"""Regression Analysis Section."""
|
||||
|
||||
import streamlit as st
|
||||
|
||||
from entropice.dashboard.plots.regression import plot_regression_scatter, plot_residuals
|
||||
from entropice.dashboard.utils.loaders import TrainingResult
|
||||
from entropice.ml.dataset import DatasetEnsemble
|
||||
|
||||
|
||||
def render_regression_analysis(selected_result: TrainingResult):
|
||||
"""Render regression analysis with true vs predicted scatter plots.
|
||||
|
||||
Args:
|
||||
selected_result: The selected TrainingResult object.
|
||||
|
||||
"""
|
||||
st.header("📊 Regression Analysis")
|
||||
|
||||
# Check if this is a regression task
|
||||
if selected_result.settings.task in ["binary", "count_regimes", "density_regimes"]:
|
||||
st.info("📈 Regression analysis is only available for regression tasks (count, density).")
|
||||
return
|
||||
|
||||
# Load predictions
|
||||
predictions_df = selected_result.load_predictions()
|
||||
if predictions_df is None:
|
||||
st.warning("⚠️ No prediction data found for this training result.")
|
||||
return
|
||||
|
||||
# Create DatasetEnsemble from settings
|
||||
with st.spinner("Loading training data to get true values..."):
|
||||
ensemble = DatasetEnsemble(
|
||||
grid=selected_result.settings.grid,
|
||||
level=selected_result.settings.level,
|
||||
members=selected_result.settings.members,
|
||||
temporal_mode=selected_result.settings.temporal_mode,
|
||||
dimension_filters=selected_result.settings.dimension_filters,
|
||||
variable_filters=selected_result.settings.variable_filters,
|
||||
add_lonlat=selected_result.settings.add_lonlat,
|
||||
)
|
||||
|
||||
# Create training set to get true values
|
||||
training_set = ensemble.create_training_set(
|
||||
task=selected_result.settings.task,
|
||||
target=selected_result.settings.target,
|
||||
device="cpu",
|
||||
cache_mode="read",
|
||||
)
|
||||
|
||||
# Get split information
|
||||
split_series = training_set.split
|
||||
|
||||
# Merge predictions with true values and split info
|
||||
# predictions_df should have 'cell_id' and 'predicted' columns
|
||||
# training_set.targets has 'y' (true values) with cell_id as index
|
||||
true_values = training_set.targets[["y"]].reset_index()
|
||||
|
||||
# Merge on cell_id
|
||||
merged = predictions_df.merge(true_values, on="cell_id", how="inner")
|
||||
merged["split"] = split_series.reindex(merged["cell_id"]).values
|
||||
|
||||
# Get train, test, and combined data
|
||||
train_data = merged[merged["split"] == "train"]
|
||||
test_data = merged[merged["split"] == "test"]
|
||||
|
||||
if len(train_data) == 0 or len(test_data) == 0:
|
||||
st.error("❌ Could not properly split data into train and test sets.")
|
||||
return
|
||||
|
||||
# Display scatter plots
|
||||
st.subheader("True vs Predicted Values")
|
||||
st.caption("Scatter plots showing the relationship between true and predicted values")
|
||||
|
||||
cols = st.columns(3)
|
||||
|
||||
with cols[0]:
|
||||
st.markdown("#### Test Set")
|
||||
st.caption("Held-out test set")
|
||||
fig_test = plot_regression_scatter(
|
||||
test_data["y"],
|
||||
test_data["predicted"],
|
||||
title="Test Set",
|
||||
)
|
||||
st.plotly_chart(fig_test, use_container_width=True)
|
||||
|
||||
with cols[1]:
|
||||
st.markdown("#### Training Set")
|
||||
st.caption("Training set")
|
||||
fig_train = plot_regression_scatter(
|
||||
train_data["y"],
|
||||
train_data["predicted"],
|
||||
title="Training Set",
|
||||
)
|
||||
st.plotly_chart(fig_train, use_container_width=True)
|
||||
|
||||
with cols[2]:
|
||||
st.markdown("#### Combined")
|
||||
st.caption("Train + Test sets")
|
||||
fig_combined = plot_regression_scatter(
|
||||
merged["y"],
|
||||
merged["predicted"],
|
||||
title="Combined",
|
||||
)
|
||||
st.plotly_chart(fig_combined, use_container_width=True)
|
||||
|
||||
# Display residual plots
|
||||
st.subheader("Residual Analysis")
|
||||
st.caption("Residual plots to assess model fit and identify patterns in errors")
|
||||
|
||||
cols = st.columns(3)
|
||||
|
||||
with cols[0]:
|
||||
fig_test_res = plot_residuals(test_data["y"], test_data["predicted"], title="Test Set Residuals")
|
||||
st.plotly_chart(fig_test_res, use_container_width=True)
|
||||
|
||||
with cols[1]:
|
||||
fig_train_res = plot_residuals(train_data["y"], train_data["predicted"], title="Training Set Residuals")
|
||||
st.plotly_chart(fig_train_res, use_container_width=True)
|
||||
|
||||
with cols[2]:
|
||||
fig_combined_res = plot_residuals(merged["y"], merged["predicted"], title="Combined Residuals")
|
||||
st.plotly_chart(fig_combined_res, use_container_width=True)
|
||||
|
|
@ -1,70 +0,0 @@
|
|||
"""Utilities for ordering predicted classes consistently across visualizations.
|
||||
|
||||
This module leverages the canonical class labels defined in the ML dataset module
|
||||
to ensure consistent ordering across all visualizations.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from entropice.utils.types import Task
|
||||
|
||||
# Canonical orderings imported from the ML pipeline
|
||||
# Binary labels are defined inline in dataset.py: {False: "No RTS", True: "RTS"}
|
||||
# Count/Density labels are defined in the bin_values function
|
||||
BINARY_LABELS = ["No RTS", "RTS"]
|
||||
COUNT_LABELS = ["None", "Very Few", "Few", "Several", "Many", "Very Many"]
|
||||
DENSITY_LABELS = ["Empty", "Very Sparse", "Sparse", "Moderate", "Dense", "Very Dense"]
|
||||
|
||||
CLASS_ORDERINGS: dict[Task | str, list[str]] = {
|
||||
"binary": BINARY_LABELS,
|
||||
"count": COUNT_LABELS,
|
||||
"density": DENSITY_LABELS,
|
||||
}
|
||||
|
||||
|
||||
def get_ordered_classes(task: Task | str, available_classes: list[str] | None = None) -> list[str]:
|
||||
"""Get properly ordered class labels for a given task.
|
||||
|
||||
This uses the same canonical ordering as defined in the ML dataset module,
|
||||
ensuring consistency between training and inference visualizations.
|
||||
|
||||
Args:
|
||||
task: Task type ('binary', 'count', 'density').
|
||||
available_classes: Optional list of available classes to filter and order.
|
||||
If None, returns all canonical classes for the task.
|
||||
|
||||
Returns:
|
||||
List of class labels in proper order.
|
||||
|
||||
Examples:
|
||||
>>> get_ordered_classes("binary")
|
||||
['No RTS', 'RTS']
|
||||
>>> get_ordered_classes("count", ["None", "Few", "Several"])
|
||||
['None', 'Few', 'Several']
|
||||
|
||||
"""
|
||||
canonical_order = CLASS_ORDERINGS[task]
|
||||
|
||||
if available_classes is None:
|
||||
return canonical_order
|
||||
|
||||
# Filter canonical order to only include available classes, preserving order
|
||||
return [cls for cls in canonical_order if cls in available_classes]
|
||||
|
||||
|
||||
def sort_class_series(series: pd.Series, task: Task | str) -> pd.Series:
|
||||
"""Sort a pandas Series with class labels according to canonical ordering.
|
||||
|
||||
Args:
|
||||
series: Pandas Series with class labels as index.
|
||||
task: Task type ('binary', 'count', 'density').
|
||||
|
||||
Returns:
|
||||
Sorted Series with classes in canonical order.
|
||||
|
||||
"""
|
||||
available_classes = series.index.tolist()
|
||||
ordered_classes = get_ordered_classes(task, available_classes)
|
||||
|
||||
# Reindex to get proper order
|
||||
return series.reindex(ordered_classes)
|
||||
|
|
@ -59,7 +59,7 @@ task_display_infos: dict[Task, TaskDisplayInfo] = {
|
|||
class TrainingResultDisplayInfo:
|
||||
task: Task
|
||||
target: TargetDataset
|
||||
model: Model
|
||||
model: Model | Literal["autogluon"]
|
||||
grid: Grid
|
||||
level: int
|
||||
timestamp: datetime
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ from shapely.geometry import shape
|
|||
import entropice.spatial.grids
|
||||
import entropice.utils.paths
|
||||
from entropice.dashboard.utils.formatters import TrainingResultDisplayInfo
|
||||
from entropice.ml.autogluon_training import AutoGluonTrainingSettings
|
||||
from entropice.ml.dataset import DatasetEnsemble, TrainingSet
|
||||
from entropice.ml.training import TrainingSettings
|
||||
from entropice.utils.types import GridConfig, TargetDataset, Task, all_target_datasets, all_tasks
|
||||
|
|
@ -215,14 +216,18 @@ class TrainingResult:
|
|||
return pd.DataFrame.from_records(records)
|
||||
|
||||
|
||||
@st.cache_data
|
||||
@st.cache_data(ttl=300) # Cache for 5 minutes
|
||||
def load_all_training_results() -> list[TrainingResult]:
|
||||
"""Load all training results from the results directory."""
|
||||
results_dir = entropice.utils.paths.RESULTS_DIR
|
||||
training_results: list[TrainingResult] = []
|
||||
incomplete_results: list[tuple[Path, Exception]] = []
|
||||
for result_path in results_dir.iterdir():
|
||||
if not result_path.is_dir():
|
||||
continue
|
||||
# Skip AutoGluon results directory
|
||||
if "autogluon" in result_path.name.lower():
|
||||
continue
|
||||
try:
|
||||
training_result = TrainingResult.from_path(result_path)
|
||||
training_results.append(training_result)
|
||||
|
|
@ -237,10 +242,159 @@ def load_all_training_results() -> list[TrainingResult]:
|
|||
training_results.append(training_result)
|
||||
is_experiment_dir = True
|
||||
except FileNotFoundError as e2:
|
||||
st.warning(f"Skipping incomplete training result: {e2}")
|
||||
incomplete_results.append((experiment_path, e2))
|
||||
if not is_experiment_dir:
|
||||
st.warning(f"Skipping incomplete training result: {e}")
|
||||
incomplete_results.append((result_path, e))
|
||||
|
||||
if len(incomplete_results) > 0:
|
||||
st.warning(
|
||||
f"Found {len(incomplete_results)} incomplete training results that were skipped:\n - "
|
||||
+ "\n - ".join(f"{p}: {e}" for p, e in incomplete_results)
|
||||
)
|
||||
# Sort by creation time (most recent first)
|
||||
training_results.sort(key=lambda tr: tr.created_at, reverse=True)
|
||||
return training_results
|
||||
|
||||
|
||||
@dataclass
|
||||
class AutogluonTrainingResult:
|
||||
"""Wrapper for training result data and metadata."""
|
||||
|
||||
path: Path
|
||||
experiment: str
|
||||
settings: AutoGluonTrainingSettings
|
||||
test_metrics: dict[str, float | dict | pd.DataFrame]
|
||||
leaderboard: pd.DataFrame
|
||||
feature_importance: pd.DataFrame | None
|
||||
created_at: float
|
||||
files: list[Path]
|
||||
|
||||
@classmethod
|
||||
def from_path(cls, result_path: Path, experiment_name: str | None = None) -> "AutogluonTrainingResult":
|
||||
"""Load an AutogluonTrainingResult from a given result directory path."""
|
||||
settings_file = result_path / "training_settings.toml"
|
||||
metrics_file = result_path / "test_metrics.pickle"
|
||||
leaderboard_file = result_path / "leaderboard.parquet"
|
||||
feature_importance_file = result_path / "feature_importance.parquet"
|
||||
all_files = list(result_path.iterdir())
|
||||
if not settings_file.exists():
|
||||
raise FileNotFoundError(f"Missing settings file in {result_path}")
|
||||
if not metrics_file.exists():
|
||||
raise FileNotFoundError(f"Missing metrics file in {result_path}")
|
||||
if not leaderboard_file.exists():
|
||||
raise FileNotFoundError(f"Missing leaderboard file in {result_path}")
|
||||
|
||||
created_at = result_path.stat().st_ctime
|
||||
settings_dict = toml.load(settings_file)["settings"]
|
||||
settings = AutoGluonTrainingSettings(**settings_dict)
|
||||
with open(metrics_file, "rb") as f:
|
||||
metrics = pickle.load(f)
|
||||
leaderboard = pd.read_parquet(leaderboard_file)
|
||||
|
||||
if feature_importance_file.exists():
|
||||
feature_importance = pd.read_parquet(feature_importance_file)
|
||||
else:
|
||||
feature_importance = None
|
||||
|
||||
return cls(
|
||||
path=result_path,
|
||||
experiment=experiment_name or "N/A",
|
||||
settings=settings,
|
||||
test_metrics=metrics,
|
||||
leaderboard=leaderboard,
|
||||
feature_importance=feature_importance,
|
||||
created_at=created_at,
|
||||
files=all_files,
|
||||
)
|
||||
|
||||
@property
|
||||
def test_confusion_matrix(self) -> pd.DataFrame | None:
|
||||
"""Get the test confusion matrix."""
|
||||
if "confusion_matrix" not in self.test_metrics:
|
||||
return None
|
||||
assert isinstance(self.test_metrics["confusion_matrix"], pd.DataFrame)
|
||||
return self.test_metrics["confusion_matrix"]
|
||||
|
||||
@property
|
||||
def display_info(self) -> TrainingResultDisplayInfo:
|
||||
"""Get display information for the training result."""
|
||||
return TrainingResultDisplayInfo(
|
||||
task=self.settings.task,
|
||||
target=self.settings.target,
|
||||
model="autogluon",
|
||||
grid=self.settings.grid,
|
||||
level=self.settings.level,
|
||||
timestamp=datetime.fromtimestamp(self.created_at),
|
||||
)
|
||||
|
||||
def _get_best_metric_name(self) -> str:
|
||||
"""Get the primary metric name for a given task."""
|
||||
match self.settings.task:
|
||||
case "binary":
|
||||
return "f1"
|
||||
case "count_regimes" | "density_regimes":
|
||||
return "f1_weighted"
|
||||
case _: # regression tasks
|
||||
return "root_mean_squared_error"
|
||||
|
||||
@staticmethod
|
||||
def to_dataframe(training_results: list["AutogluonTrainingResult"]) -> pd.DataFrame:
|
||||
"""Convert a list of AutogluonTrainingResult objects to a DataFrame for display."""
|
||||
records = []
|
||||
for tr in training_results:
|
||||
info = tr.display_info
|
||||
best_metric_name = tr._get_best_metric_name()
|
||||
|
||||
record = {
|
||||
"Experiment": tr.experiment if tr.experiment else "N/A",
|
||||
"Task": info.task,
|
||||
"Target": info.target,
|
||||
"Model": info.model,
|
||||
"Grid": GridConfig.from_grid_level((info.grid, info.level)).display_name,
|
||||
"Created At": info.timestamp.strftime("%Y-%m-%d %H:%M"),
|
||||
"Score-Metric": best_metric_name.title(),
|
||||
"Best Models Score (Test-Set)": tr.test_metrics.get(best_metric_name),
|
||||
"Path": str(tr.path.name),
|
||||
}
|
||||
records.append(record)
|
||||
return pd.DataFrame.from_records(records)
|
||||
|
||||
|
||||
@st.cache_data(ttl=300) # Cache for 5 minutes
|
||||
def load_all_autogluon_training_results() -> list[AutogluonTrainingResult]:
|
||||
"""Load all training results from the results directory."""
|
||||
results_dir = entropice.utils.paths.RESULTS_DIR
|
||||
training_results: list[AutogluonTrainingResult] = []
|
||||
incomplete_results: list[tuple[Path, Exception]] = []
|
||||
for result_path in results_dir.iterdir():
|
||||
if not result_path.is_dir():
|
||||
continue
|
||||
# Skip AutoGluon results directory
|
||||
if "autogluon" not in result_path.name.lower():
|
||||
continue
|
||||
try:
|
||||
training_result = AutogluonTrainingResult.from_path(result_path)
|
||||
training_results.append(training_result)
|
||||
except FileNotFoundError as e:
|
||||
is_experiment_dir = False
|
||||
for experiment_path in result_path.iterdir():
|
||||
if not experiment_path.is_dir():
|
||||
continue
|
||||
try:
|
||||
experiment_name = experiment_path.parent.name
|
||||
training_result = AutogluonTrainingResult.from_path(experiment_path, experiment_name)
|
||||
training_results.append(training_result)
|
||||
is_experiment_dir = True
|
||||
except FileNotFoundError as e2:
|
||||
incomplete_results.append((experiment_path, e2))
|
||||
if not is_experiment_dir:
|
||||
incomplete_results.append((result_path, e))
|
||||
|
||||
if len(incomplete_results) > 0:
|
||||
st.warning(
|
||||
f"Found {len(incomplete_results)} incomplete autogluon training results that were skipped:\n - "
|
||||
+ "\n - ".join(f"{p}: {e}" for p, e in incomplete_results)
|
||||
)
|
||||
# Sort by creation time (most recent first)
|
||||
training_results.sort(key=lambda tr: tr.created_at, reverse=True)
|
||||
return training_results
|
||||
|
|
|
|||
|
|
@ -369,6 +369,7 @@ def render_xgboost_model_state(model_state: xr.Dataset, selected_result: Trainin
|
|||
options=["gain", "weight", "cover", "total_gain", "total_cover"],
|
||||
index=0,
|
||||
help="Choose which importance metric to visualize",
|
||||
key="model_state_importance_type",
|
||||
)
|
||||
|
||||
# Top N slider
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ from entropice.dashboard.sections.experiment_results import (
|
|||
render_training_results_summary,
|
||||
)
|
||||
from entropice.dashboard.sections.storage_statistics import render_storage_statistics
|
||||
from entropice.dashboard.utils.loaders import load_all_training_results
|
||||
from entropice.dashboard.utils.loaders import load_all_autogluon_training_results, load_all_training_results
|
||||
from entropice.dashboard.utils.stats import DatasetStatistics, load_all_default_dataset_statistics
|
||||
|
||||
|
||||
|
|
@ -27,6 +27,9 @@ def render_overview_page():
|
|||
)
|
||||
# Load training results
|
||||
training_results = load_all_training_results()
|
||||
autogluon_results = load_all_autogluon_training_results()
|
||||
if len(autogluon_results) > 0:
|
||||
training_results.extend(autogluon_results)
|
||||
|
||||
if not training_results:
|
||||
st.warning("No training results found. Please run some training experiments first.")
|
||||
|
|
|
|||
|
|
@ -2,150 +2,22 @@
|
|||
|
||||
from typing import cast
|
||||
|
||||
import geopandas as gpd
|
||||
import streamlit as st
|
||||
import xarray as xr
|
||||
from stopuhr import stopwatch
|
||||
|
||||
from entropice.dashboard.plots.hyperparameter_analysis import (
|
||||
render_binned_parameter_space,
|
||||
render_confusion_matrix_heatmap,
|
||||
render_confusion_matrix_map,
|
||||
render_espa_binned_parameter_space,
|
||||
render_multi_metric_comparison,
|
||||
render_parameter_correlation,
|
||||
render_parameter_distributions,
|
||||
render_performance_summary,
|
||||
render_top_configurations,
|
||||
from entropice.dashboard.sections.cv_result import (
|
||||
render_confusion_matrices,
|
||||
render_cv_statistics_section,
|
||||
render_metrics_section,
|
||||
render_run_information,
|
||||
)
|
||||
from entropice.dashboard.sections.hparam_space import render_hparam_space_section
|
||||
from entropice.dashboard.sections.regression_analysis import render_regression_analysis
|
||||
from entropice.dashboard.utils.formatters import format_metric_name
|
||||
from entropice.dashboard.utils.loaders import TrainingResult, load_all_training_results
|
||||
from entropice.dashboard.utils.stats import CVResultsStatistics
|
||||
from entropice.utils.types import GridConfig
|
||||
from entropice.dashboard.utils.stats import CVMetricStatistics
|
||||
|
||||
|
||||
def load_predictions_with_labels(selected_result: TrainingResult) -> gpd.GeoDataFrame | None:
|
||||
"""Load predictions and merge with training data to get true labels and split info.
|
||||
|
||||
Args:
|
||||
selected_result: The selected TrainingResult object.
|
||||
|
||||
Returns:
|
||||
GeoDataFrame with predictions, true labels, and split information, or None if unavailable.
|
||||
|
||||
"""
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
from entropice.ml.dataset import DatasetEnsemble, bin_values, taskcol
|
||||
|
||||
# Load predictions
|
||||
preds_gdf = selected_result.load_predictions()
|
||||
if preds_gdf is None:
|
||||
return None
|
||||
|
||||
# Create a minimal dataset ensemble to access target data
|
||||
settings = selected_result.settings
|
||||
dataset_ensemble = DatasetEnsemble(
|
||||
grid=settings.grid,
|
||||
level=settings.level,
|
||||
target=settings.target,
|
||||
members=[], # No feature data needed, just targets
|
||||
)
|
||||
|
||||
# Load target dataset (just labels, no features)
|
||||
with st.spinner("Loading target labels..."):
|
||||
targets = dataset_ensemble._read_target()
|
||||
|
||||
# Get coverage and task columns
|
||||
task_col = taskcol[settings.task][settings.target]
|
||||
|
||||
# Filter for valid labels (same as in _cat_and_split)
|
||||
valid_labels = targets[task_col].notna()
|
||||
filtered_targets = targets.loc[valid_labels].copy()
|
||||
|
||||
# Apply binning to get class labels (same logic as _cat_and_split)
|
||||
if settings.task == "binary":
|
||||
binned = filtered_targets[task_col].map({False: "No RTS", True: "RTS"}).astype("category")
|
||||
elif settings.task == "count":
|
||||
binned = bin_values(filtered_targets[task_col].astype(int), task=settings.task)
|
||||
elif settings.task == "density":
|
||||
binned = bin_values(filtered_targets[task_col], task=settings.task)
|
||||
else:
|
||||
raise ValueError(f"Invalid task: {settings.task}")
|
||||
|
||||
filtered_targets["true_class"] = binned.to_numpy()
|
||||
|
||||
# Recreate the train/test split deterministically (same random_state=42 as in _cat_and_split)
|
||||
_train_idx, test_idx = train_test_split(
|
||||
filtered_targets.index.to_numpy(), test_size=0.2, random_state=42, shuffle=True
|
||||
)
|
||||
filtered_targets["split"] = "train"
|
||||
filtered_targets.loc[test_idx, "split"] = "test"
|
||||
filtered_targets["split"] = filtered_targets["split"].astype("category")
|
||||
|
||||
# Ensure cell_id is available as a column for merging
|
||||
# Check if cell_id already exists, otherwise use the index
|
||||
if "cell_id" not in filtered_targets.columns:
|
||||
filtered_targets = filtered_targets.reset_index().rename(columns={"index": "cell_id"})
|
||||
|
||||
# Merge predictions with labels (inner join to keep only cells with predictions)
|
||||
merged = filtered_targets.merge(preds_gdf[["cell_id", "predicted_class"]], on="cell_id", how="inner")
|
||||
merged_gdf = gpd.GeoDataFrame(merged, geometry="geometry", crs=targets.crs)
|
||||
|
||||
return merged_gdf
|
||||
|
||||
|
||||
def compute_confusion_matrix_from_merged_data(
|
||||
merged_data: gpd.GeoDataFrame,
|
||||
split_type: str,
|
||||
label_names: list[str],
|
||||
) -> xr.DataArray | None:
|
||||
"""Compute confusion matrix from merged predictions and labels.
|
||||
|
||||
Args:
|
||||
merged_data: GeoDataFrame with 'true_class', 'predicted_class', and 'split' columns.
|
||||
split_type: One of 'test', 'train', or 'all'.
|
||||
label_names: List of class label names in order.
|
||||
|
||||
Returns:
|
||||
xarray.DataArray with confusion matrix or None if data unavailable.
|
||||
|
||||
"""
|
||||
from sklearn.metrics import confusion_matrix
|
||||
|
||||
# Filter by split type
|
||||
if split_type == "train":
|
||||
data = merged_data[merged_data["split"] == "train"]
|
||||
elif split_type == "test":
|
||||
data = merged_data[merged_data["split"] == "test"]
|
||||
elif split_type == "all":
|
||||
data = merged_data
|
||||
else:
|
||||
raise ValueError(f"Invalid split_type: {split_type}")
|
||||
|
||||
if len(data) == 0:
|
||||
st.warning(f"No data available for {split_type} split.")
|
||||
return None
|
||||
|
||||
# Get true and predicted labels
|
||||
y_true = data["true_class"].to_numpy()
|
||||
y_pred = data["predicted_class"].to_numpy()
|
||||
|
||||
# Compute confusion matrix
|
||||
cm = confusion_matrix(y_true, y_pred, labels=label_names)
|
||||
|
||||
# Create xarray DataArray
|
||||
cm_xr = xr.DataArray(
|
||||
cm,
|
||||
dims=["true_label", "predicted_label"],
|
||||
coords={"true_label": label_names, "predicted_label": label_names},
|
||||
name="confusion_matrix",
|
||||
)
|
||||
|
||||
return cm_xr
|
||||
|
||||
|
||||
def render_analysis_settings_sidebar(training_results: list[TrainingResult]) -> tuple[TrainingResult, str, str, int]:
|
||||
def render_analysis_settings_sidebar(training_results: list[TrainingResult]) -> tuple[TrainingResult, str, str]:
|
||||
"""Render sidebar for training run and analysis settings selection.
|
||||
|
||||
Args:
|
||||
|
|
@ -155,6 +27,7 @@ def render_analysis_settings_sidebar(training_results: list[TrainingResult]) ->
|
|||
Tuple of (selected_result, selected_metric, refit_metric, top_n).
|
||||
|
||||
"""
|
||||
with st.sidebar.form("training_analysis_settings_form"):
|
||||
st.header("Select Training Run")
|
||||
|
||||
# Create selection options with task-first naming
|
||||
|
|
@ -178,7 +51,12 @@ def render_analysis_settings_sidebar(training_results: list[TrainingResult]) ->
|
|||
available_metrics = selected_result.available_metrics
|
||||
|
||||
# Try to get refit metric from settings
|
||||
refit_metric = "f1" if selected_result.settings.task == "binary" else "f1_weighted"
|
||||
if selected_result.settings.task == "binary":
|
||||
refit_metric = "f1"
|
||||
elif selected_result.settings.task in ["count_regimes", "density_regimes"]:
|
||||
refit_metric = "f1_weighted"
|
||||
else:
|
||||
refit_metric = "r2"
|
||||
|
||||
if refit_metric in available_metrics:
|
||||
default_metric_idx = available_metrics.index(refit_metric)
|
||||
|
|
@ -194,312 +72,18 @@ def render_analysis_settings_sidebar(training_results: list[TrainingResult]) ->
|
|||
key="metric_select",
|
||||
)
|
||||
|
||||
# Top N configurations
|
||||
top_n = st.slider(
|
||||
"Top N Configurations",
|
||||
min_value=5,
|
||||
max_value=50,
|
||||
value=10,
|
||||
step=5,
|
||||
help="Number of top configurations to display",
|
||||
key="top_n_slider",
|
||||
# Form submit button
|
||||
submitted = st.form_submit_button(
|
||||
"Load Training Result",
|
||||
type="primary",
|
||||
use_container_width=True,
|
||||
)
|
||||
|
||||
return selected_result, selected_metric, refit_metric, top_n
|
||||
if not submitted:
|
||||
st.info("👆 Click 'Load Training Result' to apply changes.")
|
||||
st.stop()
|
||||
|
||||
|
||||
def render_run_information(selected_result: TrainingResult, refit_metric):
|
||||
"""Render training run configuration overview.
|
||||
|
||||
Args:
|
||||
selected_result: The selected TrainingResult object.
|
||||
refit_metric: The refit metric used for model selection.
|
||||
|
||||
"""
|
||||
st.header("📋 Run Information")
|
||||
|
||||
grid_config = GridConfig.from_grid_level(f"{selected_result.settings.grid}{selected_result.settings.level}") # ty:ignore[invalid-argument-type]
|
||||
|
||||
col1, col2, col3, col4, col5 = st.columns(5)
|
||||
with col1:
|
||||
st.metric("Task", selected_result.settings.task.capitalize())
|
||||
with col2:
|
||||
st.metric("Target", selected_result.settings.target.capitalize())
|
||||
with col3:
|
||||
st.metric("Grid", grid_config.display_name)
|
||||
with col4:
|
||||
st.metric("Model", selected_result.settings.model.upper())
|
||||
with col5:
|
||||
st.metric("Trials", len(selected_result.results))
|
||||
|
||||
st.caption(f"**Refit Metric:** {format_metric_name(refit_metric)}")
|
||||
|
||||
|
||||
def render_test_metrics_section(selected_result: TrainingResult):
|
||||
"""Render test metrics overview showing final model performance.
|
||||
|
||||
Args:
|
||||
selected_result: The selected TrainingResult object.
|
||||
|
||||
"""
|
||||
st.header("🎯 Test Set Performance")
|
||||
st.caption("Performance metrics on the held-out test set (best model from hyperparameter search)")
|
||||
|
||||
test_metrics = selected_result.metrics
|
||||
|
||||
if not test_metrics:
|
||||
st.warning("No test metrics available for this training run.")
|
||||
return
|
||||
|
||||
# Display metrics in columns based on task type
|
||||
task = selected_result.settings.task
|
||||
|
||||
if task == "binary":
|
||||
# Binary classification metrics
|
||||
col1, col2, col3, col4, col5 = st.columns(5)
|
||||
|
||||
with col1:
|
||||
st.metric("Accuracy", f"{test_metrics.get('accuracy', 0):.4f}")
|
||||
with col2:
|
||||
st.metric("F1 Score", f"{test_metrics.get('f1', 0):.4f}")
|
||||
with col3:
|
||||
st.metric("Precision", f"{test_metrics.get('precision', 0):.4f}")
|
||||
with col4:
|
||||
st.metric("Recall", f"{test_metrics.get('recall', 0):.4f}")
|
||||
with col5:
|
||||
st.metric("Jaccard", f"{test_metrics.get('jaccard', 0):.4f}")
|
||||
else:
|
||||
# Multiclass metrics
|
||||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
with col1:
|
||||
st.metric("Accuracy", f"{test_metrics.get('accuracy', 0):.4f}")
|
||||
with col2:
|
||||
st.metric("F1 (Macro)", f"{test_metrics.get('f1_macro', 0):.4f}")
|
||||
with col3:
|
||||
st.metric("F1 (Weighted)", f"{test_metrics.get('f1_weighted', 0):.4f}")
|
||||
|
||||
col4, col5, col6 = st.columns(3)
|
||||
|
||||
with col4:
|
||||
st.metric("Precision (Macro)", f"{test_metrics.get('precision_macro', 0):.4f}")
|
||||
with col5:
|
||||
st.metric("Precision (Weighted)", f"{test_metrics.get('precision_weighted', 0):.4f}")
|
||||
with col6:
|
||||
st.metric("Recall (Macro)", f"{test_metrics.get('recall_macro', 0):.4f}")
|
||||
|
||||
col7, col8, col9 = st.columns(3)
|
||||
|
||||
with col7:
|
||||
st.metric("Jaccard (Micro)", f"{test_metrics.get('jaccard_micro', 0):.4f}")
|
||||
with col8:
|
||||
st.metric("Jaccard (Macro)", f"{test_metrics.get('jaccard_macro', 0):.4f}")
|
||||
with col9:
|
||||
st.metric("Jaccard (Weighted)", f"{test_metrics.get('jaccard_weighted', 0):.4f}")
|
||||
|
||||
|
||||
def render_cv_statistics_section(selected_result, selected_metric):
|
||||
"""Render cross-validation statistics for selected metric.
|
||||
|
||||
Args:
|
||||
selected_result: The selected TrainingResult object.
|
||||
selected_metric: The metric to display statistics for.
|
||||
|
||||
"""
|
||||
st.header("📈 Cross-Validation Statistics")
|
||||
st.caption("Performance during hyperparameter search (averaged across CV folds)")
|
||||
|
||||
from entropice.dashboard.utils.stats import CVMetricStatistics
|
||||
|
||||
cv_stats = CVMetricStatistics.compute(selected_result, selected_metric)
|
||||
|
||||
col1, col2, col3, col4, col5 = st.columns(5)
|
||||
|
||||
with col1:
|
||||
st.metric("Best Score", f"{cv_stats.best_score:.4f}")
|
||||
|
||||
with col2:
|
||||
st.metric("Mean Score", f"{cv_stats.mean_score:.4f}")
|
||||
|
||||
with col3:
|
||||
st.metric("Std Dev", f"{cv_stats.std_score:.4f}")
|
||||
|
||||
with col4:
|
||||
st.metric("Worst Score", f"{cv_stats.worst_score:.4f}")
|
||||
|
||||
with col5:
|
||||
st.metric("Median Score", f"{cv_stats.median_score:.4f}")
|
||||
|
||||
if cv_stats.mean_cv_std is not None:
|
||||
st.info(f"**Mean CV Std:** {cv_stats.mean_cv_std:.4f} - Average standard deviation across CV folds")
|
||||
|
||||
# Compare with test metric if available
|
||||
if selected_metric in selected_result.metrics:
|
||||
test_score = selected_result.metrics[selected_metric]
|
||||
st.divider()
|
||||
st.subheader("CV vs Test Performance")
|
||||
|
||||
col1, col2, col3 = st.columns(3)
|
||||
with col1:
|
||||
st.metric("Best CV Score", f"{cv_stats.best_score:.4f}")
|
||||
with col2:
|
||||
st.metric("Test Score", f"{test_score:.4f}")
|
||||
with col3:
|
||||
delta = test_score - cv_stats.best_score
|
||||
delta_pct = (delta / cv_stats.best_score * 100) if cv_stats.best_score != 0 else 0
|
||||
st.metric("Difference", f"{delta:+.4f}", delta=f"{delta_pct:+.2f}%")
|
||||
|
||||
if abs(delta) > cv_stats.std_score:
|
||||
st.warning(
|
||||
"⚠️ Test performance differs significantly from CV performance. "
|
||||
"This may indicate overfitting or data distribution mismatch."
|
||||
)
|
||||
|
||||
|
||||
@st.fragment
|
||||
def render_confusion_matrix_section(selected_result: TrainingResult, merged_predictions: gpd.GeoDataFrame | None):
|
||||
"""Render confusion matrix visualization and analysis.
|
||||
|
||||
Args:
|
||||
selected_result: The selected TrainingResult object.
|
||||
merged_predictions: GeoDataFrame with predictions merged with true labels and split info.
|
||||
|
||||
"""
|
||||
st.header("🎲 Confusion Matrix")
|
||||
st.caption("Detailed breakdown of predictions")
|
||||
|
||||
# Add selector for confusion matrix type
|
||||
cm_type = st.selectbox(
|
||||
"Select Data Split",
|
||||
options=["test", "train", "all"],
|
||||
format_func=lambda x: {"test": "Test Set", "train": "CV Set (Train Split)", "all": "All Available Data"}[x],
|
||||
help="Choose which data split to display the confusion matrix for",
|
||||
key="cm_split_select",
|
||||
)
|
||||
|
||||
# Get label names from settings
|
||||
label_names = selected_result.settings.classes
|
||||
|
||||
# Compute or load confusion matrix based on selection
|
||||
if cm_type == "test":
|
||||
if selected_result.confusion_matrix is None:
|
||||
st.warning("No confusion matrix available for the test set.")
|
||||
return
|
||||
cm = selected_result.confusion_matrix
|
||||
st.info("📊 Showing confusion matrix for the **Test Set** (held-out data, never used during training)")
|
||||
else:
|
||||
if merged_predictions is None:
|
||||
st.warning("Predictions data not available. Cannot compute confusion matrix.")
|
||||
return
|
||||
|
||||
with st.spinner(f"Computing confusion matrix for {cm_type} split..."):
|
||||
cm = compute_confusion_matrix_from_merged_data(merged_predictions, cm_type, label_names)
|
||||
if cm is None:
|
||||
return
|
||||
|
||||
if cm_type == "train":
|
||||
st.info(
|
||||
"📊 Showing confusion matrix for the **CV Set (Train Split)** "
|
||||
"(data used during hyperparameter search cross-validation)"
|
||||
)
|
||||
else: # all
|
||||
st.info("📊 Showing confusion matrix for **All Available Data** (combined train and test splits)")
|
||||
|
||||
render_confusion_matrix_heatmap(cm, selected_result.settings.task)
|
||||
|
||||
|
||||
def render_parameter_space_section(selected_result, selected_metric):
|
||||
"""Render parameter space analysis section.
|
||||
|
||||
Args:
|
||||
selected_result: The selected TrainingResult object.
|
||||
selected_metric: The metric to analyze parameters against.
|
||||
|
||||
"""
|
||||
st.header("🔍 Parameter Space Analysis")
|
||||
|
||||
# Compute CV results statistics
|
||||
cv_results_stats = CVResultsStatistics.compute(selected_result)
|
||||
|
||||
# Show parameter space summary
|
||||
with st.expander("📋 Parameter Space Summary", expanded=False):
|
||||
param_summary_df = cv_results_stats.parameters_to_dataframe()
|
||||
if not param_summary_df.empty:
|
||||
st.dataframe(param_summary_df, hide_index=True, width="stretch")
|
||||
else:
|
||||
st.info("No parameter information available.")
|
||||
|
||||
results = selected_result.results
|
||||
settings = selected_result.settings
|
||||
|
||||
# Parameter distributions
|
||||
st.subheader("📈 Parameter Distributions")
|
||||
render_parameter_distributions(results, settings)
|
||||
|
||||
# Binned parameter space plots
|
||||
st.subheader("🎨 Binned Parameter Space")
|
||||
|
||||
# Check if this is an ESPA model and show ESPA-specific plots
|
||||
model_type = settings.model
|
||||
if model_type == "espa":
|
||||
# Show ESPA-specific binned plots (eps_cl vs eps_e binned by K)
|
||||
render_espa_binned_parameter_space(results, selected_metric)
|
||||
|
||||
# Optionally show the generic binned plots in an expander
|
||||
with st.expander("📊 All Parameter Combinations", expanded=False):
|
||||
st.caption("Generic parameter space exploration (all pairwise combinations)")
|
||||
render_binned_parameter_space(results, selected_metric)
|
||||
else:
|
||||
# For non-ESPA models, show the generic binned plots
|
||||
render_binned_parameter_space(results, selected_metric)
|
||||
|
||||
|
||||
def render_data_export_section(results, selected_result):
|
||||
"""Render data export section with download buttons.
|
||||
|
||||
Args:
|
||||
results: DataFrame with CV results.
|
||||
selected_result: The selected TrainingResult object.
|
||||
|
||||
"""
|
||||
with st.expander("💾 Export Data", expanded=False):
|
||||
st.subheader("Download Results")
|
||||
|
||||
col1, col2 = st.columns(2)
|
||||
|
||||
with col1:
|
||||
# Download full results as CSV
|
||||
csv_data = results.to_csv(index=False)
|
||||
st.download_button(
|
||||
label="📥 Download Full Results (CSV)",
|
||||
data=csv_data,
|
||||
file_name=f"{selected_result.path.name}_results.csv",
|
||||
mime="text/csv",
|
||||
)
|
||||
|
||||
with col2:
|
||||
# Download settings as JSON
|
||||
import json
|
||||
|
||||
settings_dict = {
|
||||
"task": selected_result.settings.task,
|
||||
"grid": selected_result.settings.grid,
|
||||
"level": selected_result.settings.level,
|
||||
"model": selected_result.settings.model,
|
||||
"cv_splits": selected_result.settings.cv_splits,
|
||||
"classes": selected_result.settings.classes,
|
||||
}
|
||||
settings_json = json.dumps(settings_dict, indent=2)
|
||||
st.download_button(
|
||||
label="⚙️ Download Settings (JSON)",
|
||||
data=settings_json,
|
||||
file_name=f"{selected_result.path.name}_settings.json",
|
||||
mime="application/json",
|
||||
)
|
||||
|
||||
# Show raw data preview
|
||||
st.subheader("Raw Data Preview")
|
||||
st.dataframe(results.head(100), width="stretch")
|
||||
return selected_result, selected_metric, refit_metric
|
||||
|
||||
|
||||
def render_training_analysis_page():
|
||||
|
|
@ -513,91 +97,47 @@ def render_training_analysis_page():
|
|||
"""
|
||||
)
|
||||
|
||||
# Load all available training results
|
||||
# Load training results
|
||||
training_results = load_all_training_results()
|
||||
|
||||
if not training_results:
|
||||
st.warning("No training results found. Please run some training experiments first.")
|
||||
st.info("Run training using: `pixi run python -m entropice.ml.training`")
|
||||
st.stop()
|
||||
return
|
||||
|
||||
st.success(f"Found **{len(training_results)}** training result(s)")
|
||||
st.write(f"Found **{len(training_results)}** training result(s)")
|
||||
|
||||
st.divider()
|
||||
selected_result, selected_metric, refit_metric = render_analysis_settings_sidebar(training_results)
|
||||
|
||||
# Sidebar: Training run selection
|
||||
with st.sidebar:
|
||||
selection_result = render_analysis_settings_sidebar(training_results)
|
||||
if selection_result[0] is None:
|
||||
return
|
||||
selected_result, selected_metric, refit_metric, top_n = selection_result
|
||||
cv_statistics = CVMetricStatistics.compute(selected_result, selected_metric)
|
||||
|
||||
# Load predictions with labels once (used by confusion matrix and map)
|
||||
merged_predictions = load_predictions_with_labels(selected_result)
|
||||
|
||||
# Main content area
|
||||
results = selected_result.results
|
||||
settings = selected_result.settings
|
||||
|
||||
# Run Information
|
||||
render_run_information(selected_result, refit_metric)
|
||||
|
||||
st.divider()
|
||||
|
||||
# Test Metrics Section
|
||||
render_test_metrics_section(selected_result)
|
||||
render_metrics_section(selected_result)
|
||||
|
||||
st.divider()
|
||||
|
||||
# Confusion Matrix Section
|
||||
render_confusion_matrix_section(selected_result, merged_predictions)
|
||||
# Render confusion matrices for classification, regression analysis for regression
|
||||
if selected_result.settings.task in ["binary", "count_regimes", "density_regimes"]:
|
||||
render_confusion_matrices(selected_result)
|
||||
else:
|
||||
render_regression_analysis(selected_result)
|
||||
|
||||
st.divider()
|
||||
|
||||
# Performance Summary Section
|
||||
st.header("📊 CV Performance Overview")
|
||||
st.caption("Summary of hyperparameter search results across all configurations")
|
||||
render_performance_summary(results, refit_metric)
|
||||
render_cv_statistics_section(cv_statistics, selected_result.test_metrics.get(selected_metric, float("nan")))
|
||||
|
||||
st.divider()
|
||||
|
||||
# Prediction Analysis Map Section
|
||||
st.header("🗺️ Model Performance Map")
|
||||
st.caption("Interactive 3D map showing prediction correctness across the training dataset")
|
||||
render_confusion_matrix_map(selected_result.path, settings, merged_predictions)
|
||||
render_hparam_space_section(selected_result, selected_metric)
|
||||
|
||||
st.divider()
|
||||
|
||||
# Cross-Validation Statistics
|
||||
render_cv_statistics_section(selected_result, selected_metric)
|
||||
|
||||
st.divider()
|
||||
|
||||
# Parameter Space Analysis
|
||||
render_parameter_space_section(selected_result, selected_metric)
|
||||
|
||||
st.divider()
|
||||
|
||||
# Parameter Correlation
|
||||
st.header("🔗 Parameter Correlation")
|
||||
render_parameter_correlation(results, selected_metric)
|
||||
|
||||
st.divider()
|
||||
|
||||
# Multi-Metric Comparison
|
||||
if len(selected_result.available_metrics) >= 2:
|
||||
st.header("📊 Multi-Metric Comparison")
|
||||
render_multi_metric_comparison(results)
|
||||
st.divider()
|
||||
|
||||
# Top Configurations
|
||||
st.header("🏆 Top Performing Configurations")
|
||||
render_top_configurations(results, selected_metric, top_n)
|
||||
|
||||
st.divider()
|
||||
|
||||
# Raw Data Export
|
||||
render_data_export_section(results, selected_result)
|
||||
# List all results at the end
|
||||
st.header("📄 All Training Results")
|
||||
st.dataframe(selected_result.results)
|
||||
|
||||
st.balloons()
|
||||
stopwatch.summary()
|
||||
|
|
|
|||
|
|
@ -44,8 +44,8 @@ class AutoGluonSettings:
|
|||
class AutoGluonTrainingSettings(DatasetEnsemble, AutoGluonSettings):
|
||||
"""Combined settings for AutoGluon training."""
|
||||
|
||||
classes: list[str] | None
|
||||
problem_type: str
|
||||
classes: list[str] | None = None
|
||||
problem_type: str = "binary"
|
||||
|
||||
|
||||
def _determine_problem_type_and_metric(task: Task) -> tuple[str, str]:
|
||||
|
|
@ -177,6 +177,8 @@ def autogluon_train(
|
|||
toml.dump({"settings": asdict(combined_settings)}, f)
|
||||
|
||||
# Save test metrics
|
||||
# We need to use pickle here, because the confusion matrix is stored as a dataframe
|
||||
# This only matters for classification tasks
|
||||
test_metrics_file = results_dir / "test_metrics.pickle"
|
||||
print(f"💾 Saving test metrics to {test_metrics_file}")
|
||||
with open(test_metrics_file, "wb") as f:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue