Add a Storage Analysis Section

2026-01-18 20:36:47 +01:00 · 2026-01-18 20:36:47 +01:00 · 2664579a75
commit 2664579a75
parent 636c034b55
9 changed files with 370 additions and 1959 deletions
--- a/src/entropice/dashboard/app.py
+++ b/src/entropice/dashboard/app.py
@ -18,7 +18,6 @@ from entropice.dashboard.views.inference_page import render_inference_page
 from entropice.dashboard.views.model_state_page import render_model_state_page
 from entropice.dashboard.views.overview_page import render_overview_page
 from entropice.dashboard.views.training_analysis_page import render_training_analysis_page
 from entropice.dashboard.views.training_data_page import render_training_data_page
 def main():
@ -28,7 +27,6 @@ def main():
    # Setup Navigation
    overview_page = st.Page(render_overview_page, title="Overview", icon="🏡", default=True)
    data_page = st.Page(render_dataset_page, title="Dataset", icon="📊")
    training_data_page = st.Page(render_training_data_page, title="Training Data", icon="🎞️")
    training_analysis_page = st.Page(render_training_analysis_page, title="Training Results Analysis", icon="🦾")
    autogluon_page = st.Page(render_autogluon_analysis_page, title="AutoGluon Analysis", icon="🤖")
    model_state_page = st.Page(render_model_state_page, title="Model State", icon="🧮")
@ -38,8 +36,7 @@ def main():
        {
            "Overview": [overview_page],
            "Data": [data_page],
-            "Training": [training_data_page, training_analysis_page, autogluon_page],
+            "Experiments": [training_analysis_page, autogluon_page, model_state_page],
            "Model State": [model_state_page],
            "Inference": [inference_page],
        }
    )
--- a/src/entropice/dashboard/plots/source_data.py
+++ b/src/entropice/dashboard/plots/source_data.py
--- a/src/entropice/dashboard/plots/training_data.py
+++ b/src/entropice/dashboard/plots/training_data.py
@ -1,366 +0,0 @@
 """Plotting functions for training data visualizations."""
 import geopandas as gpd
 import pandas as pd
 import plotly.graph_objects as go
 import pydeck as pdk
 import streamlit as st
 from entropice.dashboard.utils.colors import get_palette
 from entropice.dashboard.utils.geometry import fix_hex_geometry
 from entropice.ml.dataset import CategoricalTrainingDataset
 def render_all_distribution_histograms(train_data_dict: dict[str, CategoricalTrainingDataset]):
    """Draw one grouped train/test class-count bar chart per task in a three-column row.
    Args:
        train_data_dict: Mapping with the keys 'binary', 'count' and 'density', each holding the
            CategoricalTrainingDataset of that task.
    """
    st.subheader("📊 Target Distribution by Task")
    # Insertion order of this mapping is also the left-to-right column order
    heading_by_task = {
        "binary": "Binary Classification",
        "count": "Count Classification",
        "density": "Density Classification",
    }
    # (legend name / counts column, bar opacity) of the two traces in every chart
    trace_specs = (("Train", 0.9), ("Test", 0.6))
    for column, (task, heading) in zip(st.columns(3), heading_by_task.items()):
        dataset = train_data_dict[task]
        categories = dataset.y.binned.cat.categories.tolist()
        colors = get_palette(task, len(categories))
        with column:
            st.markdown(f"**{heading}**")
            # Number of samples per category, separately for each partition
            train_counts = [((dataset.y.binned == cat) & (dataset.split == "train")).sum() for cat in categories]
            test_counts = [((dataset.y.binned == cat) & (dataset.split == "test")).sum() for cat in categories]
            counts_df = pd.DataFrame({"Category": categories, "Train": train_counts, "Test": test_counts})
            # Both traces share the per-category colors and differ only in opacity
            fig = go.Figure()
            for trace_name, trace_opacity in trace_specs:
                bar = go.Bar(
                    name=trace_name,
                    x=counts_df["Category"],
                    y=counts_df[trace_name],
                    marker_color=colors,
                    opacity=trace_opacity,
                    text=counts_df[trace_name],
                    textposition="inside",
                    textfont={"size": 10, "color": "white"},
                )
                fig.add_trace(bar)
            # Horizontal legend anchored just above the top-right corner of the plot
            legend_style = {
                "orientation": "h",
                "yanchor": "bottom",
                "y": 1.02,
                "xanchor": "right",
                "x": 1,
            }
            fig.update_layout(
                barmode="group",
                height=400,
                margin={"l": 20, "r": 20, "t": 20, "b": 20},
                showlegend=True,
                legend=legend_style,
                xaxis_title=None,
                yaxis_title="Count",
                xaxis={"tickangle": -45},
            )
            st.plotly_chart(fig, width="stretch")
            # Caption with the overall size and the share of each partition
            total = len(dataset)
            train_pct = (dataset.split == "train").sum() / total * 100
            test_pct = (dataset.split == "test").sum() / total * 100
            st.caption(f"Total: {total:,} | Train: {train_pct:.1f}% | Test: {test_pct:.1f}%")
 def _assign_colors_by_mode(gdf, color_mode, dataset, selected_task):
    """Add a per-row RGB 'fill_color' column to the frame according to the color mode.
    The frame is modified in place and also returned. Any mode other than the two
    listed below leaves the frame untouched.
    Args:
        gdf: GeoDataFrame to add colors to
        color_mode: One of 'target_class' or 'split'
        dataset: CategoricalTrainingDataset (only read in 'target_class' mode)
        selected_task: Task name for color palette selection
    Returns:
        GeoDataFrame with 'fill_color' column added
    """
    if color_mode == "split":
        # Fixed RGB triplets: blue for train, orange for test
        rgb_by_split = {"train": [66, 135, 245], "test": [245, 135, 66]}
        gdf["fill_color"] = gdf["split"].map(rgb_by_split)
        return gdf
    if color_mode != "target_class":
        return gdf
    class_names = dataset.y.binned.cat.categories.tolist()
    palette = get_palette(selected_task, len(class_names))
    # Category -> hex color, in category order
    hex_by_class = {name: palette[pos] for pos, name in enumerate(class_names)}
    def _to_rgb(code):
        # "#rrggbb" -> [r, g, b]
        digits = code.lstrip("#")
        return [int(digits[start : start + 2], 16) for start in range(0, 6, 2)]
    gdf["color"] = gdf["target_class"].map(hex_by_class)
    gdf["fill_color"] = gdf["color"].apply(_to_rgb)
    return gdf
@st.fragment
 def render_spatial_map(train_data_dict: dict[str, CategoricalTrainingDataset]):
    """Render a pydeck spatial map showing training data distribution with interactive controls.
    This is a Streamlit fragment that reruns independently when users interact with the
    visualization controls (color mode and opacity), without re-running the entire page.
    Cells are colored either by the binned target class of the selected task or by the
    train/test split. For the 'count' and 'density' modes the cells are additionally
    extruded by their min-max normalized raw value. A legend expander is rendered below the map.
    Args:
        train_data_dict: Dictionary with keys 'binary', 'count', 'density' and CategoricalTrainingDataset values.
    """
    st.subheader("🗺️ Spatial Distribution Map")
    # Create controls in columns
    col1, col2 = st.columns([3, 1])
    with col1:
        vis_mode = st.selectbox(
            "Visualization mode",
            options=["binary", "count", "density", "split"],
            format_func=lambda x: x.capitalize() if x != "split" else "Train/Test Split",
            key="spatial_map_mode",
        )
    with col2:
        opacity = st.slider(
            "Opacity",
            min_value=0.1,
            max_value=1.0,
            value=0.7,
            step=0.1,
            key="spatial_map_opacity",
        )
    # Determine which task dataset to use and color mode
    if vis_mode == "split":
        # Use binary dataset for split visualization
        dataset = train_data_dict["binary"]
        color_mode = "split"
        selected_task = "binary"
    else:
        # Use the selected task
        dataset = train_data_dict[vis_mode]
        color_mode = "target_class"
        selected_task = vis_mode
    # Prepare data for visualization - dataset.dataset should already be a GeoDataFrame
    # Work on a copy so the columns added below do not leak into the dataset object
    gdf: gpd.GeoDataFrame = dataset.dataset.copy()  # type: ignore[assignment]
    # Fix antimeridian issues
    gdf["geometry"] = gdf["geometry"].apply(fix_hex_geometry)
    # Add binned labels and split information from current dataset
    gdf["target_class"] = dataset.y.binned.to_numpy()
    gdf["split"] = dataset.split.to_numpy()
    gdf["raw_value"] = dataset.z.to_numpy()
    # Add information from all three tasks for tooltip
    # NOTE(review): values are assigned positionally via to_numpy(), so this assumes all three
    # task datasets contain the same rows in the same order as the selected one - confirm
    gdf["binary_label"] = train_data_dict["binary"].y.binned.to_numpy()
    gdf["count_category"] = train_data_dict["count"].y.binned.to_numpy()
    gdf["count_raw"] = train_data_dict["count"].z.to_numpy()
    gdf["density_category"] = train_data_dict["density"].y.binned.to_numpy()
    gdf["density_raw"] = train_data_dict["density"].z.to_numpy()
    # Convert to WGS84 for pydeck
    gdf_wgs84: gpd.GeoDataFrame = gdf.to_crs("EPSG:4326")  # type: ignore[assignment]
    # Assign colors based on the selected mode
    gdf_wgs84 = _assign_colors_by_mode(gdf_wgs84, color_mode, dataset, selected_task)
    # Convert to GeoJSON format and add elevation for 3D visualization
    geojson_data = []
    # Normalize raw values for elevation (only for count and density)
    use_elevation = vis_mode in ["count", "density"]
    if use_elevation:
        raw_values = gdf_wgs84["raw_value"]
        min_val, max_val = raw_values.min(), raw_values.max()
        # Normalize to 0-1 range for better 3D visualization
        if max_val > min_val:
            # Missing raw values end up flat (elevation 0) via fillna
            gdf_wgs84["elevation"] = ((raw_values - min_val) / (max_val - min_val)).fillna(0)
        else:
            # All values identical: avoid dividing by zero and keep the map flat
            gdf_wgs84["elevation"] = 0
    # Build one GeoJSON feature per row; the properties feed both the layer accessors and the tooltip
    for _, row in gdf_wgs84.iterrows():
        feature = {
            "type": "Feature",
            "geometry": row["geometry"].__geo_interface__,
            "properties": {
                "target_class": str(row["target_class"]),
                "split": str(row["split"]),
                "raw_value": float(row["raw_value"]),
                "fill_color": row["fill_color"],
                # The "elevation" column only exists when use_elevation is set
                "elevation": float(row["elevation"]) if use_elevation else 0,
                "binary_label": str(row["binary_label"]),
                "count_category": str(row["count_category"]),
                # NOTE(review): int() raises on NaN; presumably count_raw has no missing values - confirm
                "count_raw": int(row["count_raw"]),
                "density_category": str(row["density_category"]),
                "density_raw": f"{float(row['density_raw']):.4f}",
            },
        }
        geojson_data.append(feature)
    # Create pydeck layer
    layer = pdk.Layer(
        "GeoJsonLayer",
        geojson_data,
        opacity=opacity,
        stroked=True,
        filled=True,
        extruded=use_elevation,
        wireframe=False,
        get_fill_color="properties.fill_color",
        get_line_color=[80, 80, 80],
        line_width_min_pixels=0.5,
        get_elevation="properties.elevation" if use_elevation else 0,
        elevation_scale=500000,  # Scale normalized values (0-1) to 500km height
        pickable=True,
    )
    # Set initial view state (centered on the Arctic)
    # Adjust pitch and zoom based on whether we're using elevation
    view_state = pdk.ViewState(
        latitude=70,
        longitude=0,
        zoom=2 if not use_elevation else 1.5,
        pitch=0 if not use_elevation else 45,
    )
    # Create deck
    deck = pdk.Deck(
        layers=[layer],
        initial_view_state=view_state,
        # The {placeholders} refer to the feature properties built above
        tooltip={
            "html": "<b>Binary:</b> {binary_label}<br/>"
            "<b>Count Category:</b> {count_category}<br/>"
            "<b>Count Raw:</b> {count_raw}<br/>"
            "<b>Density Category:</b> {density_category}<br/>"
            "<b>Density Raw:</b> {density_raw}<br/>"
            "<b>Split:</b> {split}",
            "style": {"backgroundColor": "steelblue", "color": "white"},
        },
        map_style="https://basemaps.cartocdn.com/gl/dark-matter-gl-style/style.json",
    )
    # Render the map
    st.pydeck_chart(deck)
    # Show info about 3D visualization
    if use_elevation:
        st.info("💡 3D elevation represents raw values. Rotate the map by holding Ctrl/Cmd and dragging.")
    # Add legend
    with st.expander("Legend", expanded=True):
        if color_mode == "target_class":
            st.markdown("**Target Classes:**")
            categories = dataset.y.binned.cat.categories.tolist()
            colors_palette = get_palette(selected_task, len(categories))
            intervals = dataset.y.intervals
            # For count and density tasks, show intervals
            if selected_task in ["count", "density"]:
                for i, cat in enumerate(categories):
                    color = colors_palette[i]
                    # presumably intervals is index-aligned with categories and holds (min, max) pairs - confirm
                    interval_min, interval_max = intervals[i]
                    # Format interval display
                    if interval_min is None or interval_max is None:
                        interval_str = ""
                    elif selected_task == "count":
                        # Integer values for count
                        if interval_min == interval_max:
                            interval_str = f" ({int(interval_min)})"
                        else:
                            interval_str = f" ({int(interval_min)}-{int(interval_max)})"
                    else:  # density
                        # Percentage values for density
                        if interval_min == interval_max:
                            interval_str = f" ({interval_min * 100:.4f}%)"
                        else:
                            interval_str = f" ({interval_min * 100:.4f}%-{interval_max * 100:.4f}%)"
                    # One legend row: colored swatch followed by the category name and its interval
                    st.markdown(
                        f'<div style="display: flex; align-items: center; margin-bottom: 4px;">'
                        f'<div style="width: 20px; height: 20px; background-color: {color}; '
                        f'margin-right: 8px; border: 1px solid #ccc; flex-shrink: 0;"></div>'
                        f"<span>{cat}{interval_str}</span></div>",
                        unsafe_allow_html=True,
                    )
            else:
                # Binary task: use original column layout
                legend_cols = st.columns(len(categories))
                for i, cat in enumerate(categories):
                    with legend_cols[i]:
                        color = colors_palette[i]
                        st.markdown(
                            f'<div style="display: flex; align-items: center;">'
                            f'<div style="width: 20px; height: 20px; background-color: {color}; '
                            f'margin-right: 8px; border: 1px solid #ccc;"></div>'
                            f"<span>{cat}</span></div>",
                            unsafe_allow_html=True,
                        )
            if use_elevation:
                st.markdown("---")
                st.markdown("**Elevation (3D):**")
                # Report the un-normalized value range that the 0-1 elevation was derived from
                min_val = gdf_wgs84["raw_value"].min()
                max_val = gdf_wgs84["raw_value"].max()
                st.markdown(f"Height represents raw value: {min_val:.2f} (low) → {max_val:.2f} (high)")
        elif color_mode == "split":
            st.markdown("**Data Split:**")
            # Swatch colors are the same RGB triplets _assign_colors_by_mode uses for train/test
            legend_html = (
                '<div style="display: flex; gap: 20px;">'
                '<div style="display: flex; align-items: center;">'
                '<div style="width: 20px; height: 20px; background-color: rgb(66, 135, 245); '
                'margin-right: 8px; border: 1px solid #ccc;"></div>'
                "<span>Train</span></div>"
                '<div style="display: flex; align-items: center;">'
                '<div style="width: 20px; height: 20px; background-color: rgb(245, 135, 66); '
                'margin-right: 8px; border: 1px solid #ccc;"></div>'
                "<span>Test</span></div></div>"
            )
            st.markdown(legend_html, unsafe_allow_html=True)
--- a/src/entropice/dashboard/sections/dataset_statistics.py
+++ b/src/entropice/dashboard/sections/dataset_statistics.py
@ -431,7 +431,7 @@ def _render_aggregation_selection(
        if not submitted:
            st.info("👆 Click 'Apply Aggregation Filters' to update the configuration")
-            st.stop()
+            return dimension_filters
    return dimension_filters
--- a/src/entropice/dashboard/sections/experiment_results.py
+++ b/src/entropice/dashboard/sections/experiment_results.py
@ -103,8 +103,7 @@ def render_experiment_results(training_results: list[TrainingResult]):  # noqa:
    )
    # Expandable details for each result
-    st.subheader("Individual Experiment Details")
+    with st.expander("Show Individual Experiment Details", expanded=False):
        for tr in filtered_results:
            tr_info = tr.display_info
            display_name = tr_info.get_display_name("model_first")
@ -160,7 +159,7 @@ def render_experiment_results(training_results: list[TrainingResult]):  # noqa:
                                unique_vals = tr.results[param].nunique()
                                st.write(f"- **{param}:** {unique_vals} values ({min_val:.2e} to {max_val:.2e})")
-                with st.expander("Show CV Results DataFrame"):
+                    st.write("**CV Results DataFrame:**")
                    st.dataframe(tr.results, width="stretch", hide_index=True)
                st.write(f"\n**Path:** `{tr.path}`")
--- a/src/entropice/dashboard/sections/storage_statistics.py
+++ b/src/entropice/dashboard/sections/storage_statistics.py
@ -0,0 +1,163 @@
 """Storage Statistics Section for Entropice Dashboard."""
 import pandas as pd
 import plotly.graph_objects as go
 import streamlit as st
 from entropice.dashboard.utils.loaders import StorageInfo, load_storage_statistics
 from entropice.utils.paths import DATA_DIR
 def _format_bytes(bytes_value: int) -> str:
    """Return a byte count as a two-decimal string with a 1024-based unit (B up to PB)."""
    *scalable_units, largest_unit = ("B", "KB", "MB", "GB", "TB", "PB")
    amount = float(bytes_value)
    for suffix in scalable_units:
        if amount < 1024.0:
            return f"{amount:.2f} {suffix}"
        amount /= 1024.0
    # Anything that is still >= 1024 TB is reported in petabytes, however large
    return f"{amount:.2f} {largest_unit}"
 def _create_storage_bar_chart(storage_infos: list[StorageInfo]) -> go.Figure:
    """Build a horizontal bar chart of the size in GB of every subdirectory.
    An empty figure is returned when there is nothing to plot.
    """
    if not storage_infos:
        return go.Figure()
    bytes_per_gb = 1024**3
    dir_labels = [entry.name for entry in storage_infos]
    gb_values = [entry.size_bytes / bytes_per_gb for entry in storage_infos]
    # One bar per directory; the file count is only shown in the hover text
    bars = go.Bar(
        y=dir_labels,
        x=gb_values,
        orientation="h",
        text=[f"{gb:.2f} GB" for gb in gb_values],
        textposition="auto",
        hovertemplate="<b>%{y}</b><br>Size: %{x:.2f} GB<br>Files: %{customdata:,}<extra></extra>",
        customdata=[entry.file_count for entry in storage_infos],
        marker={
            "color": gb_values,
            "colorscale": "Blues",
            "showscale": False,
        },
    )
    chart = go.Figure()
    chart.add_trace(bars)
    # 40 px per directory, but never shorter than 400 px
    chart_height = max(400, len(dir_labels) * 40)
    chart.update_layout(
        title="Storage Usage by Subdirectory",
        xaxis_title="Size (GB)",
        yaxis_title="Directory",
        height=chart_height,
        showlegend=False,
        margin={"l": 200, "r": 50, "t": 50, "b": 50},
    )
    return chart
 def _create_storage_pie_chart(storage_infos: list[StorageInfo]) -> go.Figure:
    """Build a pie chart of each subdirectory's share of the total size in bytes.
    An empty figure is returned when there is nothing to plot.
    """
    if not storage_infos:
        return go.Figure()
    # Slices are sized by raw bytes; the hover text shows the display size string instead
    slices = go.Pie(
        labels=[entry.name for entry in storage_infos],
        values=[entry.size_bytes for entry in storage_infos],
        textinfo="label+percent",
        hovertemplate="<b>%{label}</b><br>Size: %{customdata}<br>%{percent}<extra></extra>",
        customdata=[entry.display_size for entry in storage_infos],
    )
    pie = go.Figure(data=[slices])
    pie.update_layout(title="Storage Distribution", height=500)
    return pie
 def render_storage_statistics():
    """Render the storage statistics section showing disk usage for DATA_DIR subdirectories.
    Shows three summary metrics followed by tabs with a bar chart, a pie chart and a
    detailed table with a CSV download button. When no statistics are available only a
    warning is shown.
    """
    st.header("💾 Storage Statistics")
    st.markdown(
        f"""
        This section shows the disk usage of subdirectories in the data directory:
        **`{DATA_DIR}`**
        Data is collected using [dust](https://github.com/bootandy/dust), a modern disk usage analyzer.
        Statistics are cached for 5 minutes to reduce overhead.
        """
    )
    # Load storage statistics
    with st.spinner("Analyzing storage usage..."):
        storage_infos, total_size, total_files = load_storage_statistics()
    if not storage_infos:
        st.warning("No storage data available. The data directory may be empty or inaccessible.")
        return
    # Display summary metrics
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Storage Used", _format_bytes(total_size))
    with col2:
        st.metric("Total Files", f"{total_files:,}")
    with col3:
        st.metric("Number of Subdirectories", len(storage_infos))
    # Create tabs for different visualizations
    tab1, tab2, tab3 = st.tabs(["📊 Bar Chart", "🥧 Pie Chart", "📋 Detailed Table"])
    # width="stretch" replaces the deprecated use_container_width=True and matches the
    # other dashboard sections
    with tab1:
        st.plotly_chart(_create_storage_bar_chart(storage_infos), width="stretch")
    with tab2:
        st.plotly_chart(_create_storage_pie_chart(storage_infos), width="stretch")
    with tab3:
        def _share_of_total(size_bytes: int) -> float:
            # Subdirectories that are all empty give total_size == 0; report 0% instead
            # of raising ZeroDivisionError
            return size_bytes / total_size * 100 if total_size else 0.0
        # Create DataFrame for detailed view
        df = pd.DataFrame(
            [
                {
                    "Directory": info.name,
                    "Size": info.display_size,
                    "Size (Bytes)": info.size_bytes,
                    "Files": info.file_count,
                    "Percentage": f"{_share_of_total(info.size_bytes):.2f}%",
                }
                for info in storage_infos
            ]
        )
        # The raw byte column is left out of the on-screen table but kept in the CSV export
        st.dataframe(
            df[["Directory", "Size", "Files", "Percentage"]],
            width="stretch",
            hide_index=True,
        )
        # Add download button for detailed data
        st.download_button(
            label="📥 Download Storage Statistics (CSV)",
            data=df.to_csv(index=False),
            file_name="entropice_storage_statistics.csv",
            mime="text/csv",
        )
--- a/src/entropice/dashboard/utils/loaders.py
+++ b/src/entropice/dashboard/utils/loaders.py
@ -1,6 +1,8 @@
 """Data utilities for Entropice dashboard."""
 import json
 import pickle
 import subprocess
 from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path
@ -252,3 +254,148 @@ def load_training_sets(ensemble: DatasetEnsemble) -> dict[TargetDataset, dict[Ta
        for task in all_tasks:
            train_data_dict[target][task] = ensemble.create_training_set(target=target, task=task)
    return train_data_dict
@dataclass
 class StorageInfo:
    """Storage information for a single directory, as filled in by load_storage_statistics."""
    # Last path component of the directory reported by dust
    name: str
    # Size in bytes, parsed from dust's size string by _parse_size_to_bytes
    size_bytes: int
    # Number of files looked up by directory name from the file-count dust pass (0 when unknown)
    file_count: int
    # The unparsed size string exactly as dust reported it
    display_size: str
 def _parse_size_to_bytes(size_str: str) -> int:
    """Convert dust's human-readable size string to bytes.
    Digits and dots form the number, everything else forms the unit. Only the first
    letter of the unit is significant, so "K", "KB" and "KiB" are all treated as 1024.
    Input is case-insensitive and may contain whitespace.
    Examples: "92K" -> 92*1024, "1.5M" -> 1.5*1024*1024, "928B" -> 928, "2KB" -> 2048
    Args:
        size_str: Size string such as "92K" or "1.5M".
    Returns:
        Size in bytes truncated to an int. 0 for an empty string or an unparsable number;
        a number with a missing or unknown unit is taken as plain bytes.
    """
    size_str = size_str.strip().upper()
    if not size_str:
        return 0
    # Split the characters into the numeric part and the unit part
    numeric_part = "".join(ch for ch in size_str if ch.isdigit() or ch == ".")
    unit = "".join(ch for ch in size_str if not (ch.isdigit() or ch == ".")).strip()
    try:
        value = float(numeric_part) if numeric_part else 0
    except ValueError:
        # e.g. "1.2.3M" - more than one decimal point
        return 0
    multipliers = {
        "B": 1,
        "K": 1024,
        "M": 1024**2,
        "G": 1024**3,
        "T": 1024**4,
        "P": 1024**5,
    }
    # Look up by the leading letter so multi-letter suffixes ("KB", "GIB") are not
    # silently treated as plain bytes
    return int(value * multipliers.get(unit[:1], 1))
 def _run_dust_command(data_dir: Path, for_files: bool = False) -> dict | None:
    """Invoke dust on a directory with JSON output at depth 1 and decode the result.
    Args:
        data_dir: Directory to analyze
        for_files: If True, count files (-f flag); if False, count disk space
    Returns:
        Parsed JSON dict or None if command failed
    """
    file_flag = ["-f"] if for_files else []
    argv = ["dust", "-j", "-d", "1", *file_flag, str(data_dir)]
    # A missing dust binary raises FileNotFoundError, which is deliberately not caught here
    try:
        completed = subprocess.run(argv, capture_output=True, text=True, timeout=30)
    except subprocess.TimeoutExpired:
        return None
    if completed.returncode != 0:
        return None
    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError:
        return None
 def _build_file_counts_lookup(files_data: dict | None) -> dict[str, int]:
    """Build lookup dict for file counts from dust JSON output."""
    file_counts = {}
    if files_data and "children" in files_data:
        for child in files_data["children"]:
            name = Path(child["name"]).name
            count_str = child.get("size", "0")
            file_counts[name] = _parse_size_to_bytes(count_str)
    return file_counts
@st.cache_data(ttl=300)  # Cache for 5 minutes
def load_storage_statistics() -> tuple[list[StorageInfo], int, int]:
    """Load storage statistics for DATA_DIR subdirectories using dust.
    dust is run once for disk space and, only if that succeeds, a second time for file
    counts. Problems are reported through Streamlit messages and never raised.
    Returns:
        Tuple of (subdirectory stats list sorted by size descending, total size in bytes,
        total file count). ([], 0, 0) when the data directory does not exist, dust is
        missing or fails, or any other error occurs.
    """
    # NOTE(review): the empty failure result is returned from inside the cached function,
    # so it is presumably cached for the TTL as well - confirm that is acceptable
    data_dir = entropice.utils.paths.DATA_DIR
    if not data_dir.exists():
        return [], 0, 0
    try:
        space_data = _run_dust_command(data_dir, for_files=False)
        if not space_data:
            st.warning("Failed to get storage statistics from dust")
            return [], 0, 0
        # Only pay for the second dust pass (up to another 30 s timeout) once the disk
        # space pass has succeeded; missing file counts simply fall back to 0 below
        files_data = _run_dust_command(data_dir, for_files=True)
        file_counts = _build_file_counts_lookup(files_data)
        # Extract subdirectory information from space data
        storage_infos = []
        if "children" in space_data:
            for child in space_data["children"]:
                dir_name = Path(child.get("name", "")).name
                size_str = child.get("size", "0")
                storage_infos.append(
                    StorageInfo(
                        name=dir_name,
                        size_bytes=_parse_size_to_bytes(size_str),
                        file_count=file_counts.get(dir_name, 0),
                        display_size=size_str,
                    )
                )
        total_size = sum(info.size_bytes for info in storage_infos)
        total_files = sum(info.file_count for info in storage_infos)
        # Sort by size descending
        storage_infos.sort(key=lambda x: x.size_bytes, reverse=True)
        return storage_infos, total_size, total_files
    except FileNotFoundError:
        # Raised by subprocess when the dust executable is not on PATH
        st.error("dust command not found. Please install dust: https://github.com/bootandy/dust")
        return [], 0, 0
    except Exception as e:
        # Top-level boundary for the dashboard: show the problem instead of crashing the page
        st.error(f"Error getting storage statistics: {e}")
        return [], 0, 0
--- a/src/entropice/dashboard/views/overview_page.py
+++ b/src/entropice/dashboard/views/overview_page.py
@ -8,6 +8,7 @@ from entropice.dashboard.sections.experiment_results import (
    render_experiment_results,
    render_training_results_summary,
 )
 from entropice.dashboard.sections.storage_statistics import render_storage_statistics
 from entropice.dashboard.utils.loaders import load_all_training_results
 from entropice.dashboard.utils.stats import DatasetStatistics, load_all_default_dataset_statistics
@ -52,5 +53,10 @@ def render_overview_page():
    render_dataset_statistics(all_stats, training_sample_df, feature_breakdown_df, comparison_df, inference_sample_df)
    st.divider()
    # Render storage statistics section
    render_storage_statistics()
    st.balloons()
    stopwatch.summary()
--- a/src/entropice/dashboard/views/training_data_page.py
+++ b/src/entropice/dashboard/views/training_data_page.py
@ -1,481 +0,0 @@
 """Training Data page: Visualization of training data distributions."""
 from typing import cast
 import streamlit as st
 from stopuhr import stopwatch
 from entropice.dashboard.plots.source_data import (
    render_alphaearth_map,
    render_alphaearth_overview,
    render_alphaearth_plots,
    render_arcticdem_map,
    render_arcticdem_overview,
    render_arcticdem_plots,
    render_areas_map,
    render_era5_map,
    render_era5_overview,
    render_era5_plots,
 )
 from entropice.dashboard.plots.training_data import (
    render_all_distribution_histograms,
    render_spatial_map,
 )
 from entropice.dashboard.utils.loaders import load_all_training_data, load_source_data
 from entropice.ml.dataset import CategoricalTrainingDataset, DatasetEnsemble
 from entropice.spatial import grids
 from entropice.utils.types import GridConfig, L2SourceDataset, TargetDataset, Task, grid_configs
 def render_dataset_configuration_sidebar():
    """Render the dataset configuration form in the sidebar.
    The form offers a grid configuration, a target feature and one checkbox per
    dataset member. When the form is submitted with at least one member ticked,
    a DatasetEnsemble is built and stored in ``st.session_state["dataset_ensemble"]``
    and ``st.session_state["dataset_loaded"]`` is set to True. When it is submitted
    with no member ticked, an error is shown in the sidebar and session state is
    left untouched.
    """
    with st.sidebar.form("dataset_config_form"):
        st.header("Dataset Configuration")
        # Grid selection
        grid_options = [gc.display_name for gc in grid_configs]
        grid_level_combined = st.selectbox(
            "Grid Configuration",
            options=grid_options,
            index=0,
            help="Select the grid system and resolution level",
        )
        # Map the chosen display name back to its grid config
        selected_grid_config: GridConfig = next(gc for gc in grid_configs if gc.display_name == grid_level_combined)
        # Target feature selection
        target = st.selectbox(
            "Target Feature",
            options=["darts_rts", "darts_mllabels"],
            index=0,
            help="Select the target variable for training",
        )
        # Members selection
        st.subheader("Dataset Members")
        all_members = cast(
            list[L2SourceDataset],
            ["AlphaEarth", "ArcticDEM", "ERA5-yearly", "ERA5-seasonal", "ERA5-shoulder"],
        )
        selected_members: list[L2SourceDataset] = []
        for member in all_members:
            if st.checkbox(member, value=True, help=f"Include {member} in the dataset"):
                selected_members.append(member)
        # The submit button is intentionally never disabled: widgets inside a form
        # do not trigger a rerun until submission, so a flag derived from the
        # checkboxes would reflect the previous submission and could lock the
        # button. The selection is validated after submission instead.
        load_button = st.form_submit_button(
            "Load Dataset",
            type="primary",
            use_container_width=True,
        )
    if not load_button:
        return
    if not selected_members:
        st.sidebar.error("Select at least one dataset member before loading the dataset.")
        return
    # Create the DatasetEnsemble only for a valid, submitted selection
    ensemble = DatasetEnsemble(
        grid=selected_grid_config.grid,
        level=selected_grid_config.level,
        target=cast(TargetDataset, target),
        members=selected_members,
    )
    # Store ensemble in session state
    st.session_state["dataset_ensemble"] = ensemble
    st.session_state["dataset_loaded"] = True
 def render_dataset_statistics(ensemble: DatasetEnsemble):
    """Render the ensemble configuration and its per-source statistics.
    Shows grid type, grid level, target and member count as metrics, lists the
    members, prints the dataset ID, then displays the values returned by
    ``ensemble.get_stats()``: sample and feature totals plus, per member, its
    feature/variable counts, dimensions and variable names.
    Args:
        ensemble: The dataset ensemble configuration.
    """
    st.markdown("### 📊 Dataset Configuration")
    # Display current configuration in columns
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric(label="Grid Type", value=ensemble.grid.upper())
    with col2:
        st.metric(label="Grid Level", value=ensemble.level)
    with col3:
        st.metric(label="Target Feature", value=ensemble.target.replace("darts_", ""))
    with col4:
        st.metric(label="Members", value=len(ensemble.members))
    # Display members in an expandable section
    with st.expander("🗂️ Dataset Members", expanded=False):
        if ensemble.members:
            members_cols = st.columns(len(ensemble.members))
            for member_col, member in zip(members_cols, ensemble.members):
                with member_col:
                    st.markdown(f"✓ **{member}**")
        else:
            # st.columns rejects a column count of zero, so skip the layout entirely
            st.caption("No dataset members selected.")
    # Display dataset ID in a styled container
    st.info(f"**Dataset ID:** `{ensemble.id()}`")
    # Display detailed dataset statistics
    st.markdown("---")
    st.markdown("### 📈 Dataset Statistics")
    with st.spinner("Computing dataset statistics..."):
        stats = ensemble.get_stats()
    # High-level summary metrics
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric(label="Total Samples", value=f"{stats['num_target_samples']:,}")
    with col2:
        st.metric(label="Total Features", value=f"{stats['total_features']:,}")
    with col3:
        st.metric(label="Data Sources", value=len(stats["members"]))
    # Detailed member statistics in expandable section
    with st.expander("📦 Data Source Details", expanded=False):
        for member, member_stats in stats["members"].items():
            st.markdown(f"### {member}")
            # Create metrics for this member
            metric_cols = st.columns(4)
            with metric_cols[0]:
                st.metric("Features", member_stats["num_features"])
            with metric_cols[1]:
                st.metric("Variables", member_stats["num_variables"])
            with metric_cols[2]:
                # Display dimensions as "a x b x c"
                dim_str = " x ".join([f"{dim}" for dim in member_stats["dimensions"].values()])  # type: ignore[union-attr]
                st.metric("Shape", dim_str)
            with metric_cols[3]:
                # Total data points = product of all dimension sizes
                total_points = 1
                for dim_size in member_stats["dimensions"].values():  # type: ignore[union-attr]
                    total_points *= dim_size
                st.metric("Data Points", f"{total_points:,}")
            # Show variables as colored badges
            st.markdown("**Variables:**")
            vars_html = " ".join(
                [
                    f'<span style="background-color: #e3f2fd; color: #1976d2; padding: 4px 8px; '
                    f'border-radius: 4px; margin: 2px; display: inline-block; font-size: 0.9em;">{v}</span>'
                    for v in member_stats["variables"]  # type: ignore[union-attr]
                ]
            )
            st.markdown(vars_html, unsafe_allow_html=True)
            # Show dimension details
            st.markdown("**Dimensions:**")
            dim_html = " ".join(
                [
                    f'<span style="background-color: #f3e5f5; color: #7b1fa2; padding: 4px 8px; '
                    f'border-radius: 4px; margin: 2px; display: inline-block; font-size: 0.9em;">'
                    f"{dim_name}: {dim_size}</span>"
                    for dim_name, dim_size in member_stats["dimensions"].items()  # type: ignore[union-attr]
                ]
            )
            st.markdown(dim_html, unsafe_allow_html=True)
            st.markdown("---")
 def render_labels_view(ensemble: DatasetEnsemble, train_data_dict: dict[Task, CategoricalTrainingDataset]):
    """Show label distribution histograms and the spatial label map.
    Args:
        ensemble: The dataset ensemble configuration (not read by this view).
        train_data_dict: Pre-loaded training data keyed by task. The "binary"
            entry supplies the sample counts and is checked for a geometry column.
    """
    st.markdown("### Target Labels Distribution and Spatial Visualization")
    # The binary task acts as the reference dataset for the sample counts
    reference = train_data_dict["binary"]
    n_total = len(reference)
    n_train = (reference.split == "train").sum().item()
    n_test = (reference.split == "test").sum().item()
    st.success(f"Loaded {n_total} samples ({n_train} train, {n_test} test) for all three tasks")
    # Distribution histograms, framed by horizontal rules
    st.markdown("---")
    render_all_distribution_histograms(train_data_dict)  # type: ignore[arg-type]
    st.markdown("---")
    # The spatial map needs geometries on the reference dataset
    assert "geometry" in train_data_dict["binary"].dataset.columns, "Geometry column missing in dataset"
    render_spatial_map(train_data_dict)
 def render_areas_view(ensemble: DatasetEnsemble, grid_gdf):
    """Show cell-area and land/water summary figures plus the areas map.
    Args:
        ensemble: The dataset ensemble configuration; its grid and level decide
            whether the map is drawn.
        grid_gdf: Pre-loaded grid GeoDataFrame; the ``cell_area``, ``land_ratio``
            and ``land_area`` columns are read.
    """
    st.markdown("### Grid Cell Areas and Land/Water Distribution")
    st.markdown(
        "This visualization shows the spatial distribution of cell areas, land areas, "
        "water areas, and land ratio across the grid. The grid has been filtered to "
        "include only cells in the permafrost region (>50° latitude, <85° latitude) "
        "with >10% land coverage."
    )
    n_cells = len(grid_gdf)
    cell_area = grid_gdf["cell_area"]
    st.success(
        f"Loaded {n_cells} grid cells with areas ranging from "
        f"{cell_area.min():.2f} to {cell_area.max():.2f} km²"
    )
    # Summary statistics, one metric per column
    cells_col, area_col, ratio_col, land_col = st.columns(4)
    cells_col.metric("Total Cells", f"{n_cells:,}")
    area_col.metric("Avg Cell Area", f"{cell_area.mean():.2f} km²")
    ratio_col.metric("Avg Land Ratio", f"{grid_gdf['land_ratio'].mean():.1%}")
    land_total = grid_gdf["land_area"].sum()
    land_col.metric("Total Land Area", f"{land_total:,.0f} km²")
    st.markdown("---")
    # The map is skipped for hex-6 and healpix-10 for performance reasons
    map_disabled = (ensemble.grid, ensemble.level) in (("hex", 6), ("healpix", 10))
    if not map_disabled:
        render_areas_map(grid_gdf, ensemble.grid)
        return
    st.warning(
        "🗺️ Spatial map rendering is disabled for this grid configuration (hex-6 or healpix-10) "
        "due to performance considerations."
    )
 def render_alphaearth_view(ensemble: DatasetEnsemble, alphaearth_ds, targets):
    """Show the AlphaEarth overview, plots and (grid permitting) the map.
    Args:
        ensemble: The dataset ensemble configuration; its grid and level decide
            whether the map is drawn.
        alphaearth_ds: Pre-loaded AlphaEarth dataset; its ``cell_ids`` entry is counted.
        targets: Pre-loaded targets GeoDataFrame, passed on to the map renderer.
    """
    st.markdown("### AlphaEarth Embeddings Analysis")
    n_cells = len(alphaearth_ds["cell_ids"])
    st.success(f"Loaded AlphaEarth data with {n_cells} cells")
    render_alphaearth_overview(alphaearth_ds)
    render_alphaearth_plots(alphaearth_ds)
    st.markdown("---")
    # The map is skipped for hex-6 and healpix-10 for performance reasons
    map_disabled = (ensemble.grid, ensemble.level) in (("hex", 6), ("healpix", 10))
    if not map_disabled:
        render_alphaearth_map(alphaearth_ds, targets, ensemble.grid)
        return
    st.warning(
        "🗺️ Spatial map rendering is disabled for this grid configuration (hex-6 or healpix-10) "
        "due to performance considerations."
    )
 def render_arcticdem_view(ensemble: DatasetEnsemble, arcticdem_ds, targets):
    """Show the ArcticDEM overview, plots and (grid permitting) the map.
    Args:
        ensemble: The dataset ensemble configuration; its grid and level decide
            whether the map is drawn.
        arcticdem_ds: Pre-loaded ArcticDEM dataset; its ``cell_ids`` entry is counted.
        targets: Pre-loaded targets GeoDataFrame, passed on to the map renderer.
    """
    st.markdown("### ArcticDEM Terrain Analysis")
    n_cells = len(arcticdem_ds["cell_ids"])
    st.success(f"Loaded ArcticDEM data with {n_cells} cells")
    render_arcticdem_overview(arcticdem_ds)
    render_arcticdem_plots(arcticdem_ds)
    st.markdown("---")
    # The map is skipped for hex-6 and healpix-10 for performance reasons
    map_disabled = (ensemble.grid, ensemble.level) in (("hex", 6), ("healpix", 10))
    if not map_disabled:
        render_arcticdem_map(arcticdem_ds, targets, ensemble.grid)
        return
    st.warning(
        "🗺️ Spatial map rendering is disabled for this grid configuration (hex-6 or healpix-10) "
        "due to performance considerations."
    )
@st.fragment
def render_era5_view(ensemble: DatasetEnsemble, era5_data: dict[L2SourceDataset, tuple], targets):
    """Render ERA5 climate data analysis for one selectable temporal aggregation.
    A select box lists the ERA5 aggregations present in ``era5_data``. For the
    chosen one the overview and plots are rendered, followed by the spatial map
    unless the grid configuration is hex-6 or healpix-10.
    Args:
        ensemble: The dataset ensemble configuration; its grid and level decide
            whether the map is drawn.
        era5_data: Dictionary mapping ERA5 member names to (dataset, temporal_type) tuples.
        targets: Pre-loaded targets GeoDataFrame, passed on to the map renderer.
    """
    st.markdown("### ERA5 Climate Data Analysis")
    # Let user select which ERA5 temporal aggregation to view
    era5_options = {
        "ERA5-yearly": "Yearly",
        "ERA5-seasonal": "Seasonal (Winter/Summer)",
        "ERA5-shoulder": "Shoulder Seasons (JFM/AMJ/JAS/OND)",
    }
    # Only offer aggregations that were actually loaded
    available_era5 = {k: v for k, v in era5_options.items() if k in era5_data}
    selected_era5 = st.selectbox(
        "Select ERA5 temporal aggregation",
        options=list(available_era5.keys()),
        format_func=lambda x: available_era5[x],
        key="era5_temporal_select",
    )
    if selected_era5 and selected_era5 in era5_data:
        era5_ds, temporal_type = era5_data[selected_era5]
        render_era5_overview(era5_ds, temporal_type)
        render_era5_plots(era5_ds, temporal_type)
        st.markdown("---")
        # Check if we should skip map rendering for performance
        if (ensemble.grid == "hex" and ensemble.level == 6) or (ensemble.grid == "healpix" and ensemble.level == 10):
            # Map emoji, matching the same warning in the other source-data views
            st.warning(
                "🗺️ Spatial map rendering is disabled for this grid configuration (hex-6 or healpix-10) "
                "due to performance considerations."
            )
        else:
            render_era5_map(era5_ds, targets, ensemble.grid, temporal_type)
 def render_training_data_page():
    """Render the Training Data page of the dashboard.
    Draws the sidebar configuration form and stops with a hint until an ensemble
    is present in session state. Otherwise loads the training data, the grid, the
    targets and each selected source dataset, renders the dataset statistics, and
    lays out one tab per available view.
    """
    st.title("🎯 Training Data")
    st.markdown(
        """
        Explore and visualize the training data for RTS prediction models.
        Configure your dataset by selecting grid configuration, target dataset,
        and data sources in the sidebar, then click "Load Dataset" to begin.
        """
    )
    # Sidebar form; on submit it places the ensemble into session state
    render_dataset_configuration_sidebar()
    dataset_ready = st.session_state.get("dataset_loaded", False) and "dataset_ensemble" in st.session_state
    if not dataset_ready:
        st.info(
            "👈 Configure the dataset settings in the sidebar and click 'Load Dataset' to begin exploring training data"
        )
        return
    ensemble: DatasetEnsemble = st.session_state["dataset_ensemble"]
    st.divider()
    has_alphaearth = "AlphaEarth" in ensemble.members
    has_arcticdem = "ArcticDEM" in ensemble.members
    # Load everything the tabs need in one pass
    with st.spinner("Loading dataset..."):
        # Training data for all tasks
        train_data_dict = load_all_training_data(ensemble)
        # Grid cells
        grid_gdf = grids.open(ensemble.grid, ensemble.level)
        # Targets are shared by all source data views
        targets = ensemble._read_target()
        # Optional source datasets stay None when not part of the ensemble
        alphaearth_ds = load_source_data(ensemble, "AlphaEarth")[0] if has_alphaearth else None
        arcticdem_ds = load_source_data(ensemble, "ArcticDEM")[0] if has_arcticdem else None
        # ERA5: one (dataset, temporal_type) entry per selected aggregation, where
        # temporal_type is the part after the dash ('yearly', 'seasonal' or 'shoulder')
        era5_members = [m for m in ensemble.members if m.startswith("ERA5")]
        era5_data = {m: (load_source_data(ensemble, m)[0], m.split("-")[1]) for m in era5_members}
    sample_count = len(train_data_dict["binary"])
    feature_count = ensemble.get_stats()["total_features"]
    st.success(f"Loaded dataset with {sample_count} samples and {feature_count} features")
    # Render dataset statistics
    render_dataset_statistics(ensemble)
    st.markdown("---")
    # Pair every tab label with the callable that fills it; Labels and Areas are
    # always present, the source tabs depend on the ensemble members
    views = [
        ("📊 Labels", lambda: render_labels_view(ensemble, train_data_dict)),
        ("📐 Areas", lambda: render_areas_view(ensemble, grid_gdf)),
    ]
    if has_alphaearth:
        views.append(("🌍 AlphaEarth", lambda: render_alphaearth_view(ensemble, alphaearth_ds, targets)))
    if has_arcticdem:
        views.append(("🏔️ ArcticDEM", lambda: render_arcticdem_view(ensemble, arcticdem_ds, targets)))
    if era5_members:
        # Single ERA5 tab combining all temporal variants
        views.append(("🌡️ ERA5", lambda: render_era5_view(ensemble, era5_data, targets)))
    tabs = st.tabs([label for label, _ in views])
    for tab, (_, render_view) in zip(tabs, views):
        with tab:
            render_view()
    # Show balloons after all tabs are rendered
    st.balloons()
    stopwatch.summary()