More plot refinements

This commit is contained in:
Tobias Hölzer 2025-12-23 23:33:54 +01:00
parent a64e1ac41f
commit 591da6992e
6 changed files with 298 additions and 32 deletions

View file

@ -73,7 +73,7 @@ def download(grid: Literal["hex", "healpix"], level: int):
print(f"Using scale factor of {scale_factor} for grid {grid} at level {level}.")
# 2024-2025 for hex-6
for year in track(range(2022, 2025), total=3, description="Processing years..."):
for year in track(range(2024, 2025), total=1, description="Processing years..."):
embedding_collection = ee.ImageCollection("GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL")
embedding_collection = embedding_collection.filterDate(f"{year}-01-01", f"{year}-12-31")
aggs = ["mean", "stdDev", "min", "max", "count", "median", "p1", "p5", "p25", "p75", "p95", "p99"]

View file

@ -120,11 +120,12 @@ def render_performance_summary(results: pd.DataFrame, refit_metric: str):
)
def render_parameter_distributions(results: pd.DataFrame):
def render_parameter_distributions(results: pd.DataFrame, settings: dict | None = None):
"""Render histograms of parameter distributions explored.
Args:
results: DataFrame with CV results.
settings: Optional settings dictionary containing param_grid configuration.
"""
# Get parameter columns
@ -134,6 +135,20 @@ def render_parameter_distributions(results: pd.DataFrame):
st.warning("No parameter columns found in results.")
return
# Extract scale information from settings if available
param_scales = {}
if settings and "param_grid" in settings:
param_grid = settings["param_grid"]
for param_name, param_config in param_grid.items():
if isinstance(param_config, dict) and "distribution" in param_config:
# loguniform distribution indicates log scale
if param_config["distribution"] == "loguniform":
param_scales[param_name] = "log"
else:
param_scales[param_name] = "linear"
else:
param_scales[param_name] = "linear"
# Get colormap from colors module
cmap = get_cmap("parameter_distribution")
bar_color = mcolors.rgb2hex(cmap(0.5))
@ -178,6 +193,13 @@ def render_parameter_distributions(results: pd.DataFrame):
# Determine number of bins based on unique values
n_bins = min(20, max(5, n_unique))
# Determine x-axis scale from config or infer from data
use_log_scale = param_scales.get(param_name, "linear") == "log"
if not use_log_scale:
# Fall back to automatic detection if not in config
value_range = max_val / (param_values.min() + 1e-10)
use_log_scale = value_range > 100 or max_val < 0.01
# For very small values or large ranges, use a simple bar chart instead of histogram
if n_unique <= 10:
# Few unique values - use bar chart
@ -185,6 +207,9 @@ def render_parameter_distributions(results: pd.DataFrame):
value_counts.columns = [param_name, "count"]
value_counts = value_counts.sort_values(param_name)
x_scale = alt.Scale(type="log") if use_log_scale else alt.Scale()
title_suffix = " (log scale)" if use_log_scale else ""
# Format x-axis values
if max_val < 0.01:
formatted_col = f"{param_name}_formatted"
@ -200,25 +225,72 @@ def render_parameter_distributions(results: pd.DataFrame):
alt.Tooltip("count", title="Count"),
],
)
.properties(height=250, title=param_name)
.properties(height=250, title=f"{param_name}{title_suffix}")
)
else:
chart = (
alt.Chart(value_counts)
.mark_bar(color=bar_color)
.encode(
alt.X(f"{param_name}:Q", title=param_name),
alt.X(f"{param_name}:Q", title=param_name, scale=x_scale),
alt.Y("count:Q", title="Count"),
tooltip=[
alt.Tooltip(param_name, format=".3f"),
alt.Tooltip("count", title="Count"),
],
)
.properties(height=250, title=param_name)
.properties(height=250, title=f"{param_name}{title_suffix}")
)
else:
# Many unique values - use binned histogram
# Avoid log scale for binning as it can cause issues
title_suffix = " (log scale)" if use_log_scale else ""
if use_log_scale:
# For log scale parameters, create bins in log space then transform back
# This gives better distribution visualization
log_values = np.log10(param_values.to_numpy())
log_bins = np.linspace(log_values.min(), log_values.max(), n_bins + 1)
bins_linear = 10**log_bins
# Manually bin the data
binned = pd.cut(param_values, bins=bins_linear)
bin_counts = binned.value_counts().sort_index()
# Create dataframe for plotting
bin_data = []
for interval, count in bin_counts.items():
bin_mid = (interval.left + interval.right) / 2
bin_data.append(
{
param_name: bin_mid,
"count": count,
"bin_label": f"{interval.left:.2e} - {interval.right:.2e}",
}
)
df_binned = pd.DataFrame(bin_data)
chart = (
alt.Chart(df_binned)
.mark_bar(color=bar_color)
.encode(
alt.X(
f"{param_name}:Q",
title=param_name,
scale=alt.Scale(type="log"),
axis=alt.Axis(format=".2e"),
),
alt.Y("count:Q", title="Count"),
tooltip=[
alt.Tooltip("bin_label:N", title="Range"),
alt.Tooltip("count:Q", title="Count"),
],
)
.properties(height=250, title=f"{param_name}{title_suffix}")
)
else:
# Linear scale - use standard binning
format_str = ".2e" if max_val < 0.01 else ".3f"
chart = (
alt.Chart(df_plot)
.mark_bar(color=bar_color)
@ -226,7 +298,7 @@ def render_parameter_distributions(results: pd.DataFrame):
alt.X(f"{param_name}:Q", bin=alt.Bin(maxbins=n_bins), title=param_name),
alt.Y("count()", title="Count"),
tooltip=[
alt.Tooltip(f"{param_name}:Q", format=".2e" if max_val < 0.01 else ".3f", bin=True),
alt.Tooltip(f"{param_name}:Q", format=format_str, bin=True),
"count()",
],
)
@ -778,6 +850,184 @@ def render_multi_metric_comparison(results: pd.DataFrame):
st.metric(f"Correlation between {metric1} and {metric2}", f"{corr:.3f}")
def render_espa_binned_parameter_space(results: pd.DataFrame, metric: str, k_bin_width: int = 40):
    """Render ESPA-specific binned parameter space plots.

    Creates faceted plots for all combinations of the three ESPA parameters:

    - eps_cl vs eps_e (binned by initial_K)
    - eps_cl vs initial_K (binned by eps_e)
    - eps_e vs initial_K (binned by eps_cl)

    Args:
        results: DataFrame with CV results. Must contain the columns
            ``param_initial_K``, ``param_eps_cl``, ``param_eps_e`` and
            ``mean_test_<metric>`` for the ESPA plots to be rendered.
        metric: The metric to visualize (e.g., 'f1', 'accuracy').
        k_bin_width: Width of bins for the initial_K parameter.
    """
    score_col = f"mean_test_{metric}"
    if score_col not in results.columns:
        st.warning(f"Metric {metric} not found in results.")
        return

    # Check if this is an ESPA model with the required parameters
    required_params = ["param_initial_K", "param_eps_cl", "param_eps_e"]
    if not all(param in results.columns for param in required_params):
        st.info("ESPA-specific parameters not found. This visualization is only for ESPA models.")
        return

    # Get colormap from colors module
    hex_colors = get_palette(metric, n_colors=256)

    # Prepare base plot data
    base_data = results[["param_eps_e", "param_eps_cl", "param_initial_K", score_col]].copy()
    base_data = base_data.dropna()
    if len(base_data) == 0:
        st.warning("No data available for ESPA binned parameter space.")
        return

    # Configuration for each plot combination
    plot_configs = [
        {
            "x_param": "param_eps_e",
            "y_param": "param_eps_cl",
            "bin_param": "param_initial_K",
            "x_label": "eps_e",
            "y_label": "eps_cl",
            "bin_label": "initial_K",
            "x_scale": "log",
            "y_scale": "log",
            "bin_type": "linear",
            "bin_width": k_bin_width,
            "title": "eps_cl vs eps_e (binned by initial_K)",
        },
        {
            "x_param": "param_eps_cl",
            "y_param": "param_initial_K",
            "bin_param": "param_eps_e",
            "x_label": "eps_cl",
            "y_label": "initial_K",
            "bin_label": "eps_e",
            "x_scale": "log",
            "y_scale": "linear",
            "bin_type": "log",
            "bin_width": None,  # Will use log bins
            "title": "initial_K vs eps_cl (binned by eps_e)",
        },
        {
            "x_param": "param_eps_e",
            "y_param": "param_initial_K",
            "bin_param": "param_eps_cl",
            "x_label": "eps_e",
            "y_label": "initial_K",
            "bin_label": "eps_cl",
            "x_scale": "log",
            "y_scale": "linear",
            "bin_type": "log",
            "bin_width": None,  # Will use log bins
            "title": "initial_K vs eps_e (binned by eps_cl)",
        },
    ]

    # Create each plot
    for config in plot_configs:
        st.markdown(f"**{config['title']}**")
        plot_data = base_data.copy()

        # Create bins for the binning parameter
        bin_values = plot_data[config["bin_param"]].dropna()
        if len(bin_values) == 0:
            st.warning(f"No {config['bin_label']} values found.")
            continue

        if config["bin_type"] == "log":
            # Logarithmic binning for epsilon parameters
            log_min = np.log10(bin_values.min())
            log_max = np.log10(bin_values.max())
            n_bins = min(10, max(5, int(log_max - log_min) + 1))
            bins = np.logspace(log_min, log_max, num=n_bins)
            # Adjust bins to ensure all values are captured: with
            # pd.cut(..., right=False) the intervals are half-open [left, right),
            # so the minimum needs a slightly lower first edge and the maximum a
            # slightly higher last edge.
            bins[0] = bins[0] * 0.999  # Extend first bin to capture minimum
            bins[-1] = bins[-1] * 1.001  # Extend last bin to capture maximum
        else:
            # Linear binning for initial_K
            bin_min = bin_values.min()
            bin_max = bin_values.max()
            bins = np.arange(bin_min, bin_max + config["bin_width"], config["bin_width"])
            # Bug fix: pd.cut(..., right=False) uses half-open [left, right)
            # intervals, so if the last edge lands exactly on bin_max (or only a
            # single edge was produced because all values are equal), rows at the
            # maximum initial_K would be silently dropped. Ensure the last edge
            # strictly exceeds bin_max.
            if len(bins) < 2 or bins[-1] <= bin_max:
                bins = np.append(bins, bins[-1] + config["bin_width"])

        # Bin the parameter
        plot_data["binned_param"] = pd.cut(plot_data[config["bin_param"]], bins=bins, right=False)
        # Remove any NaN bins (shouldn't happen, but just in case)
        plot_data = plot_data.dropna(subset=["binned_param"])

        # Sort bins and convert to string for ordering
        plot_data = plot_data.sort_values("binned_param")
        plot_data["binned_param_str"] = plot_data["binned_param"].astype(str)
        bin_order = plot_data["binned_param_str"].unique().tolist()

        # Create faceted scatter plot
        x_scale = alt.Scale(type=config["x_scale"]) if config["x_scale"] == "log" else alt.Scale()
        # For initial_K plots, pin the y-domain to the observed value range so the
        # axis does not start at zero.
        if config["y_label"] == "initial_K":
            y_min = plot_data[config["y_param"]].min()
            y_max = plot_data[config["y_param"]].max()
            y_scale = alt.Scale(domain=[y_min, y_max])
        else:
            y_scale = alt.Scale(type=config["y_scale"]) if config["y_scale"] == "log" else alt.Scale()

        chart = (
            alt.Chart(plot_data)
            .mark_circle(size=60, opacity=0.7)
            .encode(
                x=alt.X(
                    f"{config['x_param']}:Q",
                    scale=x_scale,
                    axis=alt.Axis(title=config["x_label"], grid=True, gridOpacity=0.5),
                ),
                y=alt.Y(
                    f"{config['y_param']}:Q",
                    scale=y_scale,
                    axis=alt.Axis(title=config["y_label"], grid=True, gridOpacity=0.5),
                ),
                color=alt.Color(
                    f"{score_col}:Q",
                    scale=alt.Scale(range=hex_colors),
                    title=metric.replace("_", " ").title(),
                ),
                tooltip=[
                    alt.Tooltip("param_eps_e:Q", title="eps_e", format=".2e"),
                    alt.Tooltip("param_eps_cl:Q", title="eps_cl", format=".2e"),
                    alt.Tooltip("param_initial_K:Q", title="initial_K", format=".0f"),
                    alt.Tooltip(f"{score_col}:Q", title=metric, format=".4f"),
                    alt.Tooltip("binned_param_str:N", title=f"{config['bin_label']} Bin"),
                ],
            )
            .properties(width=200, height=200)
            .facet(
                facet=alt.Facet(
                    "binned_param_str:N",
                    title=f"{config['bin_label']} (binned)",
                    sort=bin_order,
                ),
                columns=5,
            )
        )
        st.altair_chart(chart, use_container_width=True)

        # Show statistics about the binning
        n_bins = len(bin_order)
        n_total = len(plot_data)
        st.caption(
            f"Showing {n_total} data points across {n_bins} bins of {config['bin_label']}. "
            f"Each facet shows {config['y_label']} vs {config['x_label']} for a range of {config['bin_label']} values."
        )
        st.write("")  # Add some spacing between plots
def render_top_configurations(results: pd.DataFrame, metric: str, top_n: int = 10):
"""Render table of top N configurations.

View file

@ -4,6 +4,7 @@ import streamlit as st
from entropice.dashboard.plots.hyperparameter_analysis import (
render_binned_parameter_space,
render_espa_binned_parameter_space,
render_multi_metric_comparison,
render_parameter_correlation,
render_parameter_distributions,
@ -168,10 +169,23 @@ def render_training_analysis_page():
# Parameter distributions
st.subheader("📈 Parameter Distributions")
render_parameter_distributions(results)
render_parameter_distributions(results, settings)
# Binned parameter space plots
st.subheader("🎨 Binned Parameter Space")
# Check if this is an ESPA model and show ESPA-specific plots
model_type = settings.get("model", "espa")
if model_type == "espa":
# Show ESPA-specific binned plots (eps_cl vs eps_e binned by K)
render_espa_binned_parameter_space(results, selected_metric)
# Optionally show the generic binned plots in an expander
with st.expander("📊 All Parameter Combinations", expanded=False):
st.caption("Generic parameter space exploration (all pairwise combinations)")
render_binned_parameter_space(results, selected_metric)
else:
# For non-ESPA models, show the generic binned plots
render_binned_parameter_space(results, selected_metric)
st.divider()

View file

@ -487,11 +487,14 @@ class DatasetEnsemble:
X_test = cp.asarray(X_test)
y_train = cp.asarray(y_train)
y_test = cp.asarray(y_test)
print(f"Using CUDA device: {cp.cuda.runtime.getDeviceProperties(0)['name'].decode()}")
elif device == "torch":
X_train = torch.from_numpy(X_train)
X_test = torch.from_numpy(X_test)
y_train = torch.from_numpy(y_train)
y_test = torch.from_numpy(y_test)
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train = torch.from_numpy(X_train).to(device=torch_device)
X_test = torch.from_numpy(X_test).to(device=torch_device)
y_train = torch.from_numpy(y_train).to(device=torch_device)
y_test = torch.from_numpy(y_test).to(device=torch_device)
print(f"Using torch device: {torch.cuda.get_device_name(X_train.device) if X_train.is_cuda else 'cpu'}")
else:
assert device == "cpu", "Invalid device specified."

View file

@ -9,7 +9,7 @@ from typing import Literal
DATA_DIR = (
Path(os.environ.get("FAST_DATA_DIR", None) or os.environ.get("DATA_DIR", None) or "data").resolve() / "entropice"
)
# DATA_DIR = Path("/raid/scratch/tohoel001/data/entropice") # Temporary hardcoding for FAST cluster
DATA_DIR = Path("/raid/scratch/tohoel001/data/entropice") # Temporary hardcoding for FAST cluster
GRIDS_DIR = DATA_DIR / "grids"
FIGURES_DIR = Path("figures")

View file

@ -15,6 +15,7 @@ from entropy import ESPAClassifier
from rich import pretty, traceback
from scipy.stats import loguniform, randint
from scipy.stats._distn_infrastructure import rv_continuous_frozen, rv_discrete_frozen
from sklearn import set_config
from sklearn.model_selection import KFold, RandomizedSearchCV
from stopuhr import stopwatch
from xgboost.sklearn import XGBClassifier
@ -28,7 +29,7 @@ pretty.install()
# Disabled array_api_dispatch to avoid namespace conflicts between NumPy and CuPy
# when using XGBoost with device="cuda"
# set_config(array_api_dispatch=True)
set_config(array_api_dispatch=True)
cli = cyclopts.App("entropice-training", config=cyclopts.config.Toml("training-config.toml")) # ty:ignore[invalid-argument-type]
@ -78,10 +79,8 @@ def _create_clf(
elif settings.model == "xgboost":
param_grid = {
"learning_rate": loguniform(1e-4, 1e-1),
"max_depth": randint(3, 15),
"n_estimators": randint(100, 1000),
"subsample": loguniform(0.5, 1.0),
"colsample_bytree": loguniform(0.5, 1.0),
"max_depth": randint(5, 50),
"n_estimators": randint(50, 1000),
}
clf = XGBClassifier(
objective="multi:softprob" if settings.task != "binary" else "binary:logistic",
@ -94,9 +93,9 @@ def _create_clf(
elif settings.model == "rf":
param_grid = {
"max_depth": randint(5, 50),
"n_estimators": randint(50, 500),
"n_estimators": randint(50, 1000),
}
clf = RandomForestClassifier(random_state=42)
clf = RandomForestClassifier(random_state=42, split_criterion="entropy")
fit_params = {}
elif settings.model == "knn":
param_grid = {