From ccd40ace483065f8f0d4a5c37dbe50c2271fbb96 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobias=20H=C3=B6lzer?= <tobiashoelzer@hotmail.com>
Date: Fri, 7 Nov 2025 16:21:02 +0100
Subject: [PATCH] Add an analysis dashboard with streamlit

---
 src/entropice/training.py                    |   2 +-
 src/entropice/training_analysis_dashboard.py | 236 +++++++++++++++++++
 2 files changed, 237 insertions(+), 1 deletion(-)
 create mode 100644 src/entropice/training_analysis_dashboard.py

diff --git a/src/entropice/training.py b/src/entropice/training.py
index 27ef601..9caf913 100644
--- a/src/entropice/training.py
+++ b/src/entropice/training.py
@@ -259,7 +259,7 @@ def plot_random_cv_results(file: Path):
     figdir = file.parent
 
     # K-Plots
-    metrics = ["accuracy", "recall", "precision", "f1", "jaccard"]
+    metrics = ["f1"]
     for metric in metrics:
         _plot_k_binned(
             results,
diff --git a/src/entropice/training_analysis_dashboard.py b/src/entropice/training_analysis_dashboard.py
new file mode 100644
index 0000000..b1fa78a
--- /dev/null
+++ b/src/entropice/training_analysis_dashboard.py
@@ -0,0 +1,236 @@
+"""Streamlit dashboard for training analysis results visualization."""
+
+from pathlib import Path
+
+import matplotlib.colors as mcolors
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import streamlit as st
+
+from entropice.paths import RESULTS_DIR
+
+sns.set_theme("talk", "whitegrid")
+
+
+def _plot_k_binned(
+    results: pd.DataFrame,
+    target: str,
+    *,
+    vmin_percentile: float | None = None,
+    vmax_percentile: float | None = None,
+):
+    """Plot K-binned results with epsilon parameters."""
+    assert vmin_percentile is None or vmax_percentile is None, (
+        "Only one of vmin_percentile or vmax_percentile can be set."
+    )
+    assert "initial_K_binned" in results.columns, "initial_K_binned column not found in results."
+    assert target in results.columns, f"{target} column not found in results."
+    assert "eps_e" in results.columns, "eps_e column not found in results."
+    assert "eps_cl" in results.columns, "eps_cl column not found in results."
+
+    # add a colorbar instead of the sampled legend
+    cmap = sns.color_palette("ch:", as_cmap=True)
+    # sophisticated normalization
+    if vmin_percentile is not None:
+        vmin = np.percentile(results[target], vmin_percentile)
+        norm = mcolors.Normalize(vmin=vmin)
+    elif vmax_percentile is not None:
+        vmax = np.percentile(results[target], vmax_percentile)
+        norm = mcolors.Normalize(vmax=vmax)
+    else:
+        norm = mcolors.Normalize()
+    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
+
+    # nice col-wrap based on columns
+    n_cols = results["initial_K_binned"].unique().size
+    col_wrap = 5 if n_cols % 5 == 0 else (4 if n_cols % 4 == 0 else 3)
+
+    scatter = sns.relplot(
+        data=results,
+        x="eps_e",
+        y="eps_cl",
+        hue=target,
+        hue_norm=sm.norm,
+        palette=cmap,
+        legend=False,
+        col="initial_K_binned",
+        col_wrap=col_wrap,
+    )
+
+    # Apply log scale to all axes
+    for ax in scatter.axes.flat:
+        ax.set_xscale("log")
+        ax.set_yscale("log")
+
+    # Tight layout
+    scatter.figure.tight_layout()
+
+    # Add a shared colorbar at the bottom
+    scatter.figure.subplots_adjust(bottom=0.15)  # Make room for the colorbar
+    cbar_ax = scatter.figure.add_axes([0.15, 0.05, 0.7, 0.02])  # [left, bottom, width, height]
+    cbar = scatter.figure.colorbar(sm, cax=cbar_ax, orientation="horizontal")
+    cbar.set_label(target)
+
+    return scatter
+
+
+def _plot_eps_binned(results: pd.DataFrame, target: str, metric: str):
+    """Plot epsilon-binned results with K parameter."""
+    assert "initial_K" in results.columns, "initial_K column not found in results."
+    assert metric in results.columns, f"{metric} not found in results."
+
+    if target == "eps_cl":
+        hue = "eps_cl"
+        col = "eps_e_binned"
+    elif target == "eps_e":
+        hue = "eps_e"
+        col = "eps_cl_binned"
+    else:
+        raise ValueError(f"Invalid target: {target}")
+
+    assert hue in results.columns, f"{hue} column not found in results."
+    assert col in results.columns, f"{col} column not found in results."
+
+    return sns.relplot(results, x="initial_K", y=metric, hue=hue, col=col, col_wrap=5, hue_norm=mcolors.LogNorm())
+
+
+def load_and_prepare_results(file_path: Path) -> pd.DataFrame:
+    """Load results file and prepare binned columns."""
+    results = pd.read_parquet(file_path)
+
+    # Bin the initial_K into 40er bins
+    results["initial_K_binned"] = pd.cut(results["initial_K"], bins=range(20, 401, 40), right=False)
+
+    # Bin the eps_cl and eps_e into logarithmic bins
+    eps_cl_bins = np.logspace(-3, 7, num=10)
+    eps_e_bins = np.logspace(-3, 7, num=10)
+    results["eps_cl_binned"] = pd.cut(results["eps_cl"], bins=eps_cl_bins)
+    results["eps_e_binned"] = pd.cut(results["eps_e"], bins=eps_e_bins)
+
+    return results
+
+
+def get_available_result_files() -> list[Path]:
+    """Get all available result files from RESULTS_DIR."""
+    if not RESULTS_DIR.exists():
+        return []
+
+    result_files = []
+    for search_dir in RESULTS_DIR.iterdir():
+        if search_dir.is_dir():
+            result_file = search_dir / "search_results.parquet"
+            if result_file.exists():
+                result_files.append(result_file)
+
+    return sorted(result_files, reverse=True)  # Most recent first
+
+
+def main():
+    """Run Streamlit dashboard application."""
+    st.set_page_config(page_title="Training Analysis Dashboard", layout="wide")
+
+    st.title("Training Analysis Dashboard")
+    st.markdown("Interactive visualization of RandomizedSearchCV results")
+
+    # Sidebar for file and parameter selection
+    st.sidebar.header("Configuration")
+
+    # Get available result files
+    result_files = get_available_result_files()
+
+    if not result_files:
+        st.error(f"No result files found in {RESULTS_DIR}")
+        st.info("Please run a random CV search first to generate results.")
+        return
+
+    # File selection
+    file_options = {str(f.parent.name): f for f in result_files}
+    selected_file_name = st.sidebar.selectbox(
+        "Select Result File", options=list(file_options.keys()), help="Choose a search result file to visualize"
+    )
+    selected_file = file_options[selected_file_name]
+
+    # Load and prepare data
+    with st.spinner("Loading results..."):
+        results = load_and_prepare_results(selected_file)
+
+    st.sidebar.success(f"Loaded {len(results)} results")
+
+    # Metric selection
+    available_metrics = ["accuracy", "recall", "precision", "f1", "jaccard"]
+    selected_metric = st.sidebar.selectbox(
+        "Select Metric", options=available_metrics, help="Choose which metric to visualize"
+    )
+
+    # Percentile normalization option
+    use_percentile = st.sidebar.checkbox(
+        "Use Percentile Normalization", value=True, help="Apply percentile-based color normalization to plots"
+    )
+
+    # Display some basic statistics
+    st.header("Dataset Overview")
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.metric("Total Runs", len(results))
+    with col2:
+        best_score = results[f"mean_test_{selected_metric}"].max()
+        st.metric(f"Best {selected_metric.capitalize()}", f"{best_score:.4f}")
+    with col3:
+        best_idx = results[f"mean_test_{selected_metric}"].idxmax()
+        best_k = results.loc[best_idx, "initial_K"]
+        st.metric("Best K", f"{best_k:.0f}")
+
+    # Show best parameters
+    with st.expander("Best Parameters"):
+        best_idx = results[f"mean_test_{selected_metric}"].idxmax()
+        best_params = results.loc[best_idx, ["initial_K", "eps_cl", "eps_e", f"mean_test_{selected_metric}"]]
+        st.dataframe(best_params.to_frame().T, use_container_width=True)
+
+    # Main plots
+    st.header(f"Visualization for {selected_metric.capitalize()}")
+
+    # K-binned plots
+    st.subheader("K-Binned Parameter Space (Mean)")
+    with st.spinner("Generating mean plot..."):
+        if use_percentile:
+            fig1 = _plot_k_binned(results, f"mean_test_{selected_metric}", vmin_percentile=50)
+        else:
+            fig1 = _plot_k_binned(results, f"mean_test_{selected_metric}")
+        st.pyplot(fig1.figure)
+        plt.close()
+
+    st.subheader("K-Binned Parameter Space (Std)")
+    with st.spinner("Generating std plot..."):
+        if use_percentile:
+            fig2 = _plot_k_binned(results, f"std_test_{selected_metric}", vmax_percentile=50)
+        else:
+            fig2 = _plot_k_binned(results, f"std_test_{selected_metric}")
+        st.pyplot(fig2.figure)
+        plt.close()
+
+    # Epsilon-binned plots
+    col1, col2 = st.columns(2)
+
+    with col1:
+        st.subheader("K vs eps_cl")
+        with st.spinner("Generating eps_cl plot..."):
+            fig3 = _plot_eps_binned(results, "eps_cl", f"mean_test_{selected_metric}")
+            st.pyplot(fig3.figure)
+            plt.close()
+
+    with col2:
+        st.subheader("K vs eps_e")
+        with st.spinner("Generating eps_e plot..."):
+            fig4 = _plot_eps_binned(results, "eps_e", f"mean_test_{selected_metric}")
+            st.pyplot(fig4.figure)
+            plt.close()
+
+    # Optional: Raw data table
+    with st.expander("View Raw Results Data"):
+        st.dataframe(results, use_container_width=True)
+
+
+if __name__ == "__main__":
+    main()