Add Dataset Analysis to the overview page

This commit is contained in:
Tobias Hölzer 2025-12-28 15:31:51 +01:00
parent 6960571742
commit a304c96e4e

View file

@ -3,9 +3,639 @@
from datetime import datetime

import pandas as pd
import plotly.express as px
import streamlit as st

from entropice.dashboard.plots.colors import get_palette
from entropice.dashboard.utils.data import load_all_training_results
from entropice.dataset import DatasetEnsemble
def render_sample_count_overview():
    """Render overview of sample counts per task+target+grid+level combination.

    For every (grid, level, target) combination a minimal :class:`DatasetEnsemble`
    (no members) is built just to read the target table, and the number of samples
    with coverage, with valid labels, and with both is counted per task. The
    results are shown as a heatmap, a grouped bar chart, and a data table.
    """
    st.subheader("📊 Sample Counts by Configuration")
    st.markdown(
        """
        This visualization shows the number of available samples for each combination of:
        - **Task**: binary, count, density
        - **Target Dataset**: darts_rts, darts_mllabels
        - **Grid System**: hex, healpix
        - **Grid Level**: varying by grid type
        """
    )

    # Define all possible grid configurations
    grid_configs = [
        ("hex", 3),
        ("hex", 4),
        ("hex", 5),
        ("hex", 6),
        ("healpix", 6),
        ("healpix", 7),
        ("healpix", 8),
        ("healpix", 9),
        ("healpix", 10),
    ]
    target_datasets = ["darts_rts", "darts_mllabels"]
    tasks = ["binary", "count", "density"]

    # Collect sample counts for every configuration.
    sample_data = []
    with st.spinner("Computing sample counts for all configurations..."):
        for grid, level in grid_configs:
            for target in target_datasets:
                # Create minimal ensemble (no members) just to get target data.
                ensemble = DatasetEnsemble(grid=grid, level=level, target=target, members=[])  # type: ignore[arg-type]
                # Read target data once per (grid, level, target).
                targets = ensemble._read_target()
                for task in tasks:
                    # Task-specific label column and shared coverage column.
                    taskcol = ensemble.taskcol(task)  # type: ignore[arg-type]
                    covcol = ensemble.covcol
                    # Count samples with coverage and valid labels.
                    if covcol in targets.columns and taskcol in targets.columns:
                        valid_coverage = targets[covcol].sum()
                        valid_labels = targets[taskcol].notna().sum()
                        valid_both = (targets[covcol] & targets[taskcol].notna()).sum()
                        sample_data.append(
                            {
                                "Grid": f"{grid}-{level}",
                                "Grid Type": grid,
                                "Level": level,
                                "Target": target.replace("darts_", ""),
                                "Task": task.capitalize(),
                                "Samples (Coverage)": valid_coverage,
                                "Samples (Labels)": valid_labels,
                                "Samples (Both)": valid_both,
                                # Zero-padded sort key so healpix-10 sorts after healpix-9.
                                "Grid_Level_Sort": f"{grid}_{level:02d}",
                            }
                        )

    sample_df = pd.DataFrame(sample_data)

    # Create tabs for different views of the same data.
    tab1, tab2, tab3 = st.tabs(["📈 Heatmap", "📊 Bar Chart", "📋 Data Table"])

    with tab1:
        st.markdown("### Sample Counts Heatmap")
        st.markdown("Showing counts of samples with both coverage and valid labels")
        # One heatmap per target dataset.
        for target in target_datasets:
            target_df = sample_df[sample_df["Target"] == target.replace("darts_", "")]
            # Pivot for heatmap: Grid x Task.
            pivot_df = target_df.pivot_table(index="Grid", columns="Task", values="Samples (Both)", aggfunc="mean")
            # Sort index by grid type and level (via the zero-padded sort key).
            sort_order = sample_df[["Grid", "Grid_Level_Sort"]].drop_duplicates().set_index("Grid")
            pivot_df = pivot_df.reindex(sort_order.sort_values("Grid_Level_Sort").index)
            # Color palette used as a continuous scale for sample counts.
            sample_colors = get_palette(f"sample_counts_{target}", n_colors=10)
            fig = px.imshow(
                pivot_df,
                labels={"x": "Task", "y": "Grid Configuration", "color": "Sample Count"},
                x=pivot_df.columns,
                y=pivot_df.index,
                color_continuous_scale=sample_colors,
                aspect="auto",
                title=f"Target: {target}",
            )
            # Annotate each cell with its (comma-formatted) count.
            fig.update_traces(text=pivot_df.values, texttemplate="%{text:,}", textfont_size=10)
            fig.update_layout(height=400)
            st.plotly_chart(fig, use_container_width=True)

    with tab2:
        st.markdown("### Sample Counts Bar Chart")
        st.markdown("Showing counts of samples with both coverage and valid labels")
        # Faceted bar chart showing both targets side by side, colored by task.
        n_tasks = sample_df["Task"].nunique()
        task_colors = get_palette("task_types", n_colors=n_tasks)
        fig = px.bar(
            sample_df,
            x="Grid",
            y="Samples (Both)",
            color="Task",
            facet_col="Target",
            barmode="group",
            title="Sample Counts by Grid Configuration and Target Dataset",
            labels={"Grid": "Grid Configuration", "Samples (Both)": "Number of Samples"},
            color_discrete_sequence=task_colors,
            height=500,
        )
        # Strip the "Target=" prefix plotly puts on facet titles.
        fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
        fig.update_xaxes(tickangle=-45)
        st.plotly_chart(fig, use_container_width=True)

    with tab3:
        st.markdown("### Detailed Sample Counts")
        # Display full table with formatting.
        display_df = sample_df[
            ["Grid", "Target", "Task", "Samples (Coverage)", "Samples (Labels)", "Samples (Both)"]
        ].copy()
        # Format numbers with thousands separators for readability.
        for col in ["Samples (Coverage)", "Samples (Labels)", "Samples (Both)"]:
            display_df[col] = display_df[col].apply(lambda x: f"{x:,}")
        st.dataframe(display_df, hide_index=True, use_container_width=True)
def _members_for(grid: str, level: int) -> list[str]:
    """Return the data-source members available for a grid configuration.

    AlphaEarth embeddings are not produced for the finest resolutions
    (healpix level 10 and hex level 6); every other configuration gets
    the full set of data sources.
    """
    if (grid == "healpix" and level == 10) or (grid == "hex" and level == 6):
        return ["ArcticDEM", "ERA5-yearly", "ERA5-seasonal", "ERA5-shoulder"]
    return ["AlphaEarth", "ArcticDEM", "ERA5-yearly", "ERA5-seasonal", "ERA5-shoulder"]


@st.fragment
def render_feature_count_fragment():
    """Render interactive feature count visualization using fragments.

    Two sections:
    1. A fixed comparison of feature counts across all grid configurations
       (with every available data source enabled, target ``darts_rts``).
    2. An interactive explorer where the user picks a grid configuration,
       target dataset, and data sources, and gets detailed statistics.
    Runs as a Streamlit fragment so widget interaction only reruns this part.
    """
    st.subheader("🔢 Feature Counts by Dataset Configuration")
    st.markdown(
        """
        This visualization shows the total number of features that would be generated
        for different combinations of data sources and grid configurations.
        """
    )

    # First section: Comparison across all grid configurations
    st.markdown("### Feature Count Comparison Across Grid Configurations")
    st.markdown("Comparing feature counts for all grid configurations with all data sources enabled")

    # Define all possible grid configurations
    grid_configs = [
        ("hex", 3),
        ("hex", 4),
        ("hex", 5),
        ("hex", 6),
        ("healpix", 6),
        ("healpix", 7),
        ("healpix", 8),
        ("healpix", 9),
        ("healpix", 10),
    ]

    # Collect feature statistics for all configurations.
    feature_comparison_data = []
    with st.spinner("Computing feature counts for all grid configurations..."):
        for grid, level in grid_configs:
            # Determine which members are available for this configuration.
            members = _members_for(grid, level)
            # Use darts_rts as default target for comparison.
            ensemble = DatasetEnsemble(grid=grid, level=level, target="darts_rts", members=members)  # type: ignore[arg-type]
            stats = ensemble.get_stats()
            # Minimum cell count across data sources limits where inference is possible.
            min_cells = min(
                member_stats["dimensions"]["cell_ids"]  # type: ignore[index]
                for member_stats in stats["members"].values()
            )
            feature_comparison_data.append(
                {
                    "Grid": f"{grid}-{level}",
                    "Grid Type": grid,
                    "Level": level,
                    "Total Features": stats["total_features"],
                    "Data Sources": len(members),
                    "Inference Cells": min_cells,
                    "Total Samples": stats["num_target_samples"],
                    "AlphaEarth": "AlphaEarth" in members,
                    # Zero-padded sort key so healpix-10 sorts after healpix-9.
                    "Grid_Level_Sort": f"{grid}_{level:02d}",
                }
            )

    comparison_df = pd.DataFrame(feature_comparison_data)

    # Create tabs for different comparison views.
    comp_tab1, comp_tab2, comp_tab3 = st.tabs(["📊 Bar Chart", "📈 Breakdown", "📋 Data Table"])

    with comp_tab1:
        st.markdown("#### Total Features by Grid Configuration")
        # Collect per-source breakdown data for the stacked bar chart.
        stacked_data = []
        for _, row in comparison_df.iterrows():
            grid_config = row["Grid"]
            grid, level_str = grid_config.split("-")
            level = int(level_str)
            members = _members_for(grid, level)
            ensemble = DatasetEnsemble(grid=grid, level=level, target="darts_rts", members=members)  # type: ignore[arg-type]
            stats = ensemble.get_stats()
            # Add data for each member.
            for member, member_stats in stats["members"].items():
                stacked_data.append(
                    {
                        "Grid": grid_config,
                        "Data Source": member,
                        "Number of Features": member_stats["num_features"],
                        "Grid_Level_Sort": row["Grid_Level_Sort"],
                    }
                )
            # Lon/lat contribute two extra features when enabled on the ensemble.
            if ensemble.add_lonlat:
                stacked_data.append(
                    {
                        "Grid": grid_config,
                        "Data Source": "Lon/Lat",
                        "Number of Features": 2,
                        "Grid_Level_Sort": row["Grid_Level_Sort"],
                    }
                )
        stacked_df = pd.DataFrame(stacked_data)
        stacked_df = stacked_df.sort_values("Grid_Level_Sort")

        # Get color palette for data sources.
        unique_sources = stacked_df["Data Source"].unique()
        n_sources = len(unique_sources)
        source_colors = get_palette("data_sources", n_colors=n_sources)

        # Create stacked bar chart.
        fig = px.bar(
            stacked_df,
            x="Grid",
            y="Number of Features",
            color="Data Source",
            barmode="stack",
            title="Total Features by Data Source Across Grid Configurations",
            labels={"Grid": "Grid Configuration", "Number of Features": "Number of Features"},
            color_discrete_sequence=source_colors,
            text_auto=False,
        )
        fig.update_layout(height=500, xaxis_tickangle=-45)
        st.plotly_chart(fig, use_container_width=True)

        # Secondary metrics side by side; one palette shared by both charts
        # (hoisted out of the column contexts so each chart can use it).
        col1, col2 = st.columns(2)
        n_grids = len(comparison_df)
        grid_colors = get_palette("grid_configs", n_colors=n_grids)
        with col1:
            fig_cells = px.bar(
                comparison_df,
                x="Grid",
                y="Inference Cells",
                color="Grid",
                title="Inference Cells by Grid Configuration",
                labels={"Grid": "Grid Configuration", "Inference Cells": "Number of Cells"},
                color_discrete_sequence=grid_colors,
                text="Inference Cells",
            )
            fig_cells.update_traces(texttemplate="%{text:,}", textposition="outside")
            fig_cells.update_layout(xaxis_tickangle=-45, showlegend=False)
            st.plotly_chart(fig_cells, use_container_width=True)
        with col2:
            fig_samples = px.bar(
                comparison_df,
                x="Grid",
                y="Total Samples",
                color="Grid",
                title="Total Samples by Grid Configuration",
                labels={"Grid": "Grid Configuration", "Total Samples": "Number of Samples"},
                color_discrete_sequence=grid_colors,
                text="Total Samples",
            )
            fig_samples.update_traces(texttemplate="%{text:,}", textposition="outside")
            fig_samples.update_layout(xaxis_tickangle=-45, showlegend=False)
            st.plotly_chart(fig_samples, use_container_width=True)

    with comp_tab2:
        st.markdown("#### Feature Breakdown by Data Source")
        st.markdown("Showing percentage contribution of each data source across all grid configurations")

        # Collect breakdown data for all grid configurations.
        all_breakdown_data = []
        for _, row in comparison_df.iterrows():
            grid_config = row["Grid"]
            grid, level_str = grid_config.split("-")
            level = int(level_str)
            members = _members_for(grid, level)
            ensemble = DatasetEnsemble(grid=grid, level=level, target="darts_rts", members=members)  # type: ignore[arg-type]
            stats = ensemble.get_stats()
            total_features = stats["total_features"]
            # Add data for each member with its percentage of the total.
            for member, member_stats in stats["members"].items():
                percentage = (member_stats["num_features"] / total_features) * 100  # type: ignore[operator]
                all_breakdown_data.append(
                    {
                        "Grid": grid_config,
                        "Data Source": member,
                        "Percentage": percentage,
                        "Number of Features": member_stats["num_features"],
                        "Grid_Level_Sort": row["Grid_Level_Sort"],
                    }
                )
            # Lon/lat contribute two extra features when enabled on the ensemble.
            if ensemble.add_lonlat:
                percentage = (2 / total_features) * 100  # type: ignore[operator]
                all_breakdown_data.append(
                    {
                        "Grid": grid_config,
                        "Data Source": "Lon/Lat",
                        "Percentage": percentage,
                        "Number of Features": 2,
                        "Grid_Level_Sort": row["Grid_Level_Sort"],
                    }
                )
        breakdown_all_df = pd.DataFrame(all_breakdown_data)
        # Sort by grid configuration.
        breakdown_all_df = breakdown_all_df.sort_values("Grid_Level_Sort")

        # Get color palette for data sources.
        unique_sources = breakdown_all_df["Data Source"].unique()
        n_sources = len(unique_sources)
        source_colors = get_palette("data_sources", n_colors=n_sources)

        # Create donut charts for each grid configuration, laid out 3 per row.
        num_grids = len(comparison_df)
        cols_per_row = 3
        num_rows = (num_grids + cols_per_row - 1) // cols_per_row
        for row_idx in range(num_rows):
            cols = st.columns(cols_per_row)
            for col_idx in range(cols_per_row):
                grid_idx = row_idx * cols_per_row + col_idx
                if grid_idx < num_grids:
                    grid_config = comparison_df.iloc[grid_idx]["Grid"]
                    grid_data = breakdown_all_df[breakdown_all_df["Grid"] == grid_config]
                    with cols[col_idx]:
                        fig = px.pie(
                            grid_data,
                            names="Data Source",
                            values="Number of Features",
                            title=grid_config,
                            hole=0.4,
                            color_discrete_sequence=source_colors,
                        )
                        fig.update_traces(textposition="inside", textinfo="percent")
                        fig.update_layout(showlegend=True, height=350)
                        st.plotly_chart(fig, use_container_width=True)

    with comp_tab3:
        st.markdown("#### Detailed Feature Count Comparison")
        # Display full comparison table with formatting.
        display_df = comparison_df[
            ["Grid", "Total Features", "Data Sources", "Inference Cells", "Total Samples", "AlphaEarth"]
        ].copy()
        # Format numbers with thousands separators.
        for col in ["Total Features", "Inference Cells", "Total Samples"]:
            display_df[col] = display_df[col].apply(lambda x: f"{x:,}")
        # Format boolean as Yes/No. BUG FIX: both branches previously rendered
        # an empty string, so the AlphaEarth column was always blank.
        display_df["AlphaEarth"] = display_df["AlphaEarth"].apply(lambda x: "Yes" if x else "No")
        st.dataframe(display_df, hide_index=True, use_container_width=True)

    st.divider()

    # Second section: Detailed configuration with user selection
    st.markdown("### Detailed Configuration Explorer")
    st.markdown("Select specific grid configuration and data sources for detailed statistics")

    # Grid selection
    grid_options = [
        "hex-3",
        "hex-4",
        "hex-5",
        "hex-6",
        "healpix-6",
        "healpix-7",
        "healpix-8",
        "healpix-9",
        "healpix-10",
    ]
    col1, col2 = st.columns(2)
    with col1:
        grid_level_combined = st.selectbox(
            "Grid Configuration",
            options=grid_options,
            index=0,
            help="Select the grid system and resolution level",
            key="feature_grid_select",
        )
    with col2:
        target = st.selectbox(
            "Target Dataset",
            options=["darts_rts", "darts_mllabels"],
            index=0,
            help="Select the target dataset",
            key="feature_target_select",
        )

    # Parse grid type and level from the "grid-level" option string.
    grid, level_str = grid_level_combined.split("-")
    level = int(level_str)

    # Members selection
    st.markdown("#### Select Data Sources")
    # AlphaEarth is disabled where it is not available (see _members_for).
    disable_alphaearth = "AlphaEarth" not in _members_for(grid, level)
    all_members = ["AlphaEarth", "ArcticDEM", "ERA5-yearly", "ERA5-seasonal", "ERA5-shoulder"]

    # Use columns for checkboxes.
    cols = st.columns(len(all_members))
    selected_members = []
    for idx, member in enumerate(all_members):
        with cols[idx]:
            if member == "AlphaEarth" and disable_alphaearth:
                st.checkbox(
                    member,
                    value=False,
                    disabled=True,
                    help=f"Not available for {grid} level {level}",
                    key=f"feature_member_{member}",
                )
            elif st.checkbox(member, value=True, key=f"feature_member_{member}"):
                selected_members.append(member)

    # Guard clause: nothing selected, nothing to compute.
    if not selected_members:
        st.info("👆 Select at least one data source to see feature statistics")
        return

    st.markdown("---")
    ensemble = DatasetEnsemble(grid=grid, level=level, target=target, members=selected_members)
    with st.spinner("Computing dataset statistics..."):
        stats = ensemble.get_stats()

    # High-level metrics
    col1, col2, col3, col4, col5 = st.columns(5)
    with col1:
        st.metric("Total Features", f"{stats['total_features']:,}")
    with col2:
        # Minimum cell count across data sources limits inference coverage.
        min_cells = min(
            member_stats["dimensions"]["cell_ids"]  # type: ignore[index]
            for member_stats in stats["members"].values()
        )
        # BUG FIX: help text said "union" but the code computes the minimum.
        st.metric("Inference Cells", f"{min_cells:,}", help="Minimum number of cells across all data sources")
    with col3:
        st.metric("Data Sources", len(selected_members))
    with col4:
        st.metric("Total Samples", f"{stats['num_target_samples']:,}")
    with col5:
        # Total data points = features x samples.
        total_points = stats["total_features"] * stats["num_target_samples"]
        st.metric("Total Data Points", f"{total_points:,}")

    # Feature breakdown by source
    st.markdown("#### Feature Breakdown by Data Source")
    breakdown_data = []
    for member, member_stats in stats["members"].items():
        breakdown_data.append(
            {
                "Data Source": member,
                "Number of Features": member_stats["num_features"],
                "Percentage": f"{member_stats['num_features'] / stats['total_features'] * 100:.1f}%",  # type: ignore[operator]
            }
        )
    # Lon/lat contribute two extra features when enabled on the ensemble.
    if ensemble.add_lonlat:
        breakdown_data.append(
            {
                "Data Source": "Lon/Lat",
                "Number of Features": 2,
                "Percentage": f"{2 / stats['total_features'] * 100:.1f}%",
            }
        )
    breakdown_df = pd.DataFrame(breakdown_data)

    # Get color palette for data sources.
    n_sources = len(breakdown_df)
    source_colors = get_palette("data_sources", n_colors=n_sources)

    # Create pie chart.
    fig = px.pie(
        breakdown_df,
        names="Data Source",
        values="Number of Features",
        title="Feature Distribution by Data Source",
        hole=0.4,
        color_discrete_sequence=source_colors,
    )
    fig.update_traces(textposition="inside", textinfo="percent+label")
    st.plotly_chart(fig, use_container_width=True)

    # Show detailed table.
    st.dataframe(breakdown_df, hide_index=True, use_container_width=True)

    # Detailed member information
    with st.expander("📦 Detailed Source Information", expanded=False):
        for member, member_stats in stats["members"].items():
            st.markdown(f"### {member}")
            metric_cols = st.columns(4)
            with metric_cols[0]:
                st.metric("Features", member_stats["num_features"])
            with metric_cols[1]:
                st.metric("Variables", member_stats["num_variables"])
            with metric_cols[2]:
                dim_str = " x ".join([str(dim) for dim in member_stats["dimensions"].values()])  # type: ignore[union-attr]
                st.metric("Shape", dim_str)
            with metric_cols[3]:
                # Product of all dimension sizes = raw data points for this source.
                total_points = 1
                for dim_size in member_stats["dimensions"].values():  # type: ignore[union-attr]
                    total_points *= dim_size
                st.metric("Data Points", f"{total_points:,}")
            # Variables rendered as styled inline badges.
            st.markdown("**Variables:**")
            vars_html = " ".join(
                [
                    f'<span style="background-color: #e3f2fd; color: #1976d2; padding: 4px 8px; '
                    f'border-radius: 4px; margin: 2px; display: inline-block; font-size: 0.9em;">{v}</span>'
                    for v in member_stats["variables"]  # type: ignore[union-attr]
                ]
            )
            st.markdown(vars_html, unsafe_allow_html=True)
            # Dimensions rendered as styled inline badges.
            st.markdown("**Dimensions:**")
            dim_html = " ".join(
                [
                    f'<span style="background-color: #f3e5f5; color: #7b1fa2; padding: 4px 8px; '
                    f'border-radius: 4px; margin: 2px; display: inline-block; font-size: 0.9em;">'
                    f"{dim_name}: {dim_size}</span>"
                    for dim_name, dim_size in member_stats["dimensions"].items()  # type: ignore[union-attr]
                ]
            )
            st.markdown(dim_html, unsafe_allow_html=True)
            st.markdown("---")
def render_dataset_analysis():
    """Render the dataset analysis section with sample and feature counts."""
    st.header("📈 Dataset Analysis")
    # Two analyses, one tab each: sample counts and feature counts.
    sample_tab, feature_tab = st.tabs(["📊 Sample Counts", "🔢 Feature Counts"])
    with sample_tab:
        render_sample_count_overview()
    with feature_tab:
        render_feature_count_fragment()
def render_overview_page(): def render_overview_page():
@ -20,8 +650,10 @@ def render_overview_page():
st.write(f"Found **{len(training_results)}** training result(s)") st.write(f"Found **{len(training_results)}** training result(s)")
st.divider()
# Summary statistics at the top # Summary statistics at the top
st.subheader("Summary Statistics") st.header("📊 Training Results Summary")
col1, col2, col3, col4 = st.columns(4) col1, col2, col3, col4 = st.columns(4)
with col1: with col1:
@ -43,8 +675,14 @@ def render_overview_page():
st.divider() st.divider()
# Add dataset analysis section
render_dataset_analysis()
st.divider()
# Detailed results table # Detailed results table
st.subheader("Training Results") st.header("🎯 Experiment Results")
st.subheader("Results Table")
# Build a summary dataframe # Build a summary dataframe
summary_data = [] summary_data = []
@ -93,7 +731,7 @@ def render_overview_page():
st.divider() st.divider()
# Expandable details for each result # Expandable details for each result
st.subheader("Detailed Results") st.subheader("Individual Experiment Details")
for tr in training_results: for tr in training_results:
with st.expander(tr.get_display_name("task_first")): with st.expander(tr.get_display_name("task_first")):