Add an analysis dashboard with streamlit

2025-11-07 16:21:02 +01:00 · 2025-11-07 16:21:02 +01:00 · ccd40ace48
commit ccd40ace48
parent 3e0e6e0d2d
2 changed files with 237 additions and 1 deletions
--- a/src/entropice/training.py
+++ b/src/entropice/training.py
@@ -259,7 +259,7 @@ def plot_random_cv_results(file: Path):
    figdir = file.parent
    # K-Plots
-    metrics = ["accuracy", "recall", "precision", "f1", "jaccard"]
+    metrics = ["f1"]
    for metric in metrics:
        _plot_k_binned(
            results,
--- a/src/entropice/training_analysis_dashboard.py
+++ b/src/entropice/training_analysis_dashboard.py
@@ -0,0 +1,236 @@
 """Streamlit dashboard for training analysis results visualization."""
 from pathlib import Path
 import matplotlib.colors as mcolors
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
 import streamlit as st
 from entropice.paths import RESULTS_DIR
 # Apply seaborn's "talk" plotting context and "whitegrid" axes style globally at import time.
 sns.set_theme(context="talk", style="whitegrid")
 def _plot_k_binned(
    results: pd.DataFrame,
    target: str,
    *,
    vmin_percentile: float | None = None,
    vmax_percentile: float | None = None,
 ):
    """Scatter ``eps_e`` against ``eps_cl``, faceted by ``initial_K_binned`` and coloured by ``target``.

    Both axes are log-scaled and a single horizontal colorbar, shared by all
    facets, replaces the per-point legend.

    Args:
        results: Frame holding the columns ``initial_K_binned``, ``eps_e``,
            ``eps_cl`` and ``target``.
        target: Name of the column mapped to colour.
        vmin_percentile: If given, the lower colour limit is pinned to this
            percentile of the finite ``target`` values.
        vmax_percentile: If given, the upper colour limit is pinned to this
            percentile of the finite ``target`` values.

    Returns:
        The seaborn grid object produced by ``sns.relplot``.

    Raises:
        ValueError: If both percentiles are given, or a required column is missing.
    """
    # Explicit raises instead of asserts so the checks survive `python -O`.
    if vmin_percentile is not None and vmax_percentile is not None:
        raise ValueError("Only one of vmin_percentile or vmax_percentile can be set.")
    for column in ("initial_K_binned", target, "eps_e", "eps_cl"):
        if column not in results.columns:
            raise ValueError(f"{column} column not found in results.")
    # add a colorbar instead of the sampled legend
    cmap = sns.color_palette("ch:", as_cmap=True)
    # Pin at most one colour limit to a percentile; the other limit stays unset
    # and is filled in from the data when the plot is drawn.
    percentile = vmin_percentile if vmin_percentile is not None else vmax_percentile
    bounds = {}
    if percentile is not None:
        # Ignore missing / non-finite scores: a single NaN would otherwise turn
        # the percentile (and with it the whole colour scale) into NaN.
        values = results[target].dropna().to_numpy(dtype=float)
        finite = values[np.isfinite(values)]
        if finite.size:
            key = "vmin" if vmin_percentile is not None else "vmax"
            bounds[key] = np.percentile(finite, percentile)
    norm = mcolors.Normalize(**bounds)
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    # Pick a col-wrap that divides the number of facets evenly where possible.
    # seaborn draws one facet per non-null level (for a categorical column:
    # every category), so count those rather than `unique()`, which counts NaN.
    facet_values = results["initial_K_binned"]
    if isinstance(facet_values.dtype, pd.CategoricalDtype):
        n_cols = len(facet_values.cat.categories)
    else:
        n_cols = facet_values.nunique()
    col_wrap = 5 if n_cols % 5 == 0 else (4 if n_cols % 4 == 0 else 3)
    scatter = sns.relplot(
        data=results,
        x="eps_e",
        y="eps_cl",
        hue=target,
        hue_norm=sm.norm,
        palette=cmap,
        legend=False,
        col="initial_K_binned",
        col_wrap=col_wrap,
    )
    # Apply log scale to all axes
    for ax in scatter.axes.flat:
        ax.set_xscale("log")
        ax.set_yscale("log")
    # Tight layout first, then reserve a strip at the bottom for the colorbar.
    scatter.figure.tight_layout()
    scatter.figure.subplots_adjust(bottom=0.15)
    cbar_ax = scatter.figure.add_axes([0.15, 0.05, 0.7, 0.02])  # [left, bottom, width, height]
    # `sm` shares its norm object with the plot, so the bar matches the point colours.
    cbar = scatter.figure.colorbar(sm, cax=cbar_ax, orientation="horizontal")
    cbar.set_label(target)
    return scatter
 def _plot_eps_binned(results: pd.DataFrame, target: str, metric: str):
    """Plot ``metric`` against ``initial_K``, coloured by one epsilon and faceted by the other's bins.

    Args:
        results: Frame holding ``initial_K``, ``metric``, the ``target`` column
            and the binned column of the other epsilon parameter.
        target: ``"eps_cl"`` (facets on ``eps_e_binned``) or ``"eps_e"``
            (facets on ``eps_cl_binned``); this column drives the colour.
        metric: Name of the column drawn on the y axis.

    Returns:
        The seaborn grid object produced by ``sns.relplot``.

    Raises:
        ValueError: If ``target`` is not one of the two epsilon names or a
            required column is missing.
    """
    # Colour column -> facet column (the binned version of the *other* epsilon).
    facet_for_hue = {"eps_cl": "eps_e_binned", "eps_e": "eps_cl_binned"}
    # Explicit raises (not asserts) so validation survives `python -O` and
    # matches the ValueError already used for an invalid target.
    if "initial_K" not in results.columns:
        raise ValueError("initial_K column not found in results.")
    if metric not in results.columns:
        raise ValueError(f"{metric} not found in results.")
    if target not in facet_for_hue:
        raise ValueError(f"Invalid target: {target}")
    hue, col = target, facet_for_hue[target]
    for column in (hue, col):
        if column not in results.columns:
            raise ValueError(f"{column} column not found in results.")
    # Colour is mapped through a logarithmic norm.
    return sns.relplot(results, x="initial_K", y=metric, hue=hue, col=col, col_wrap=5, hue_norm=mcolors.LogNorm())
 def load_and_prepare_results(file_path: Path) -> pd.DataFrame:
    """Read a results parquet file and append binned versions of the search parameters.

    Adds three columns, in this order: ``initial_K_binned``, ``eps_cl_binned``
    and ``eps_e_binned``.

    Args:
        file_path: Location of the parquet file to read.

    Returns:
        The loaded frame with the three binned columns attached.
    """
    frame = pd.read_parquet(file_path)
    # Left-closed bins of width 40 on initial_K; the edges are 20, 60, ..., 380.
    # NOTE(review): values >= 380 lie beyond the last edge and become NaN --
    # confirm that is intended given the stop value of 401.
    k_edges = range(20, 401, 40)
    # Ten log-spaced edges from 1e-3 to 1e7, used for both epsilon parameters.
    eps_edges = np.logspace(-3, 7, num=10)
    return frame.assign(
        initial_K_binned=pd.cut(frame["initial_K"], bins=k_edges, right=False),
        eps_cl_binned=pd.cut(frame["eps_cl"], bins=eps_edges),
        eps_e_binned=pd.cut(frame["eps_e"], bins=eps_edges),
    )
 def get_available_result_files() -> list[Path]:
    """Collect every ``search_results.parquet`` found one level below ``RESULTS_DIR``.

    Returns:
        The matching paths sorted in descending order, or an empty list when
        ``RESULTS_DIR`` does not exist.
    """
    if not RESULTS_DIR.exists():
        return []
    # One candidate path per immediate sub-directory.
    candidates = (entry / "search_results.parquet" for entry in RESULTS_DIR.iterdir() if entry.is_dir())
    found = [candidate for candidate in candidates if candidate.exists()]
    # Descending path order -- presumably run directories carry sortable
    # timestamps so this lists the most recent first; confirm against the writer.
    found.sort(reverse=True)
    return found
 def _render_grid(grid) -> None:
    """Show a seaborn grid's figure on the Streamlit page, then close that exact figure."""
    st.pyplot(grid.figure)
    # Close the rendered figure explicitly instead of whatever pyplot considers "current".
    plt.close(grid.figure)
 def main():
    """Run the Streamlit dashboard application.

    Lets the user pick a result file and a metric in the sidebar, shows the
    best run for that metric, and renders the K-binned and epsilon-binned
    plots plus the raw results table. Returns early with an on-page message
    when no result files, no known metric columns, or no usable scores exist.
    """
    st.set_page_config(page_title="Training Analysis Dashboard", layout="wide")
    st.title("Training Analysis Dashboard")
    st.markdown("Interactive visualization of RandomizedSearchCV results")
    # Sidebar for file and parameter selection
    st.sidebar.header("Configuration")
    # Get available result files
    result_files = get_available_result_files()
    if not result_files:
        st.error(f"No result files found in {RESULTS_DIR}")
        st.info("Please run a random CV search first to generate results.")
        return
    # File selection, labelled by the directory that contains each result file
    file_options = {str(f.parent.name): f for f in result_files}
    selected_file_name = st.sidebar.selectbox(
        "Select Result File", options=list(file_options.keys()), help="Choose a search result file to visualize"
    )
    selected_file = file_options[selected_file_name]
    # Load and prepare data
    with st.spinner("Loading results..."):
        results = load_and_prepare_results(selected_file)
    st.sidebar.success(f"Loaded {len(results)} results")
    # Metric selection: only offer metrics whose mean column is actually present,
    # so picking one can never fail with a KeyError further down.
    known_metrics = ["accuracy", "recall", "precision", "f1", "jaccard"]
    available_metrics = [name for name in known_metrics if f"mean_test_{name}" in results.columns]
    if not available_metrics:
        st.error("No known metric columns (mean_test_<metric>) found in the selected results.")
        return
    selected_metric = st.sidebar.selectbox(
        "Select Metric", options=available_metrics, help="Choose which metric to visualize"
    )
    mean_col = f"mean_test_{selected_metric}"
    std_col = f"std_test_{selected_metric}"
    # Percentile normalization option
    use_percentile = st.sidebar.checkbox(
        "Use Percentile Normalization", value=True, help="Apply percentile-based color normalization to plots"
    )
    # Without at least one non-missing score there is no "best" run to report
    # (this also covers an empty result file).
    scores = results[mean_col]
    if not scores.notna().any():
        st.warning(f"No valid {mean_col} values found in the selected results.")
        return
    # Locate the best run once and reuse it for every summary widget.
    best_idx = scores.idxmax()
    # Display some basic statistics
    st.header("Dataset Overview")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Runs", len(results))
    with col2:
        st.metric(f"Best {selected_metric.capitalize()}", f"{scores.max():.4f}")
    with col3:
        best_k = results.loc[best_idx, "initial_K"]
        st.metric("Best K", f"{best_k:.0f}")
    # Show best parameters
    with st.expander("Best Parameters"):
        best_params = results.loc[best_idx, ["initial_K", "eps_cl", "eps_e", mean_col]]
        st.dataframe(best_params.to_frame().T, use_container_width=True)
    # Main plots
    st.header(f"Visualization for {selected_metric.capitalize()}")
    # K-binned plots: the mean plot pins the lower colour limit, the std plot the upper one.
    st.subheader("K-Binned Parameter Space (Mean)")
    with st.spinner("Generating mean plot..."):
        mean_kwargs = {"vmin_percentile": 50} if use_percentile else {}
        _render_grid(_plot_k_binned(results, mean_col, **mean_kwargs))
    st.subheader("K-Binned Parameter Space (Std)")
    if std_col in results.columns:
        with st.spinner("Generating std plot..."):
            std_kwargs = {"vmax_percentile": 50} if use_percentile else {}
            _render_grid(_plot_k_binned(results, std_col, **std_kwargs))
    else:
        st.info(f"No {std_col} column in the selected results; skipping the std plot.")
    # Epsilon-binned plots, side by side
    for container, eps_name in zip(st.columns(2), ("eps_cl", "eps_e")):
        with container:
            st.subheader(f"K vs {eps_name}")
            with st.spinner(f"Generating {eps_name} plot..."):
                _render_grid(_plot_eps_binned(results, eps_name, mean_col))
    # Optional: Raw data table
    with st.expander("View Raw Results Data"):
        st.dataframe(results, use_container_width=True)
 # Entry point: build the dashboard only when this module is executed as a script.
 if __name__ == "__main__":
    main()