This commit is contained in:
Tobias Hölzer 2025-10-26 18:28:46 +01:00
parent 3ad332b5a8
commit eeab8fff1e
16 changed files with 536 additions and 943 deletions

View file

@ -1,5 +1,5 @@
[project]
name = "entropic-perma-risk"
name = "entropice"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
@ -49,10 +49,14 @@ dependencies = [
]
[project.scripts]
create-grid = "steps.s0_0_grids.create_grid:main"
rts = "steps.s0_1_rts.rts:main"
alpha-earth = "steps.s1_0_alphaearth.alphaearth:main"
era5 = "steps.s1_1_era5.era5:cli"
create-grid = "entropice.grids:main"
darts = "entropice.darts:main"
alpha-earth = "entropice.alphaearth:main"
era5 = "entropice.era5:cli"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.uv]
package = true

View file

@ -0,0 +1,2 @@
def hello() -> str:
    """Return the package's greeting string."""
    greeting = "Hello from entropice!"
    return greeting

View file

@ -4,9 +4,7 @@ Author: Tobias Hölzer
Date: October 2025
"""
import os
import warnings
from pathlib import Path
from typing import Literal
import cyclopts
@ -16,9 +14,13 @@ import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
import xdggs
from rich import pretty, print, traceback
from rich.progress import track
from entropice import codecs, grids
from entropice.paths import get_annual_embeddings_file, get_embeddings_store
# Filter out the GeoDataFrame.swapaxes deprecation warning
warnings.filterwarnings("ignore", message=".*GeoDataFrame.swapaxes.*", category=FutureWarning)
@ -26,26 +28,19 @@ pretty.install()
traceback.install()
ee.Initialize(project="ee-tobias-hoelzer")
DATA_DIR = Path(os.environ.get("DATA_DIR", "../../data")) / "entropyc-rts"
EMBEDDINGS_DIR = DATA_DIR / "embeddings"
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
cli = cyclopts.App(name="alpha-earth")
@cli.command()
def download(grid: Literal["hex", "healpix"], level: int, backup_intermediate: bool = False):
def download(grid: Literal["hex", "healpix"], level: int):
"""Extract satellite embeddings from Google Earth Engine and map them to a grid.
Args:
grid (Literal["hex", "healpix"]): The grid type to use.
level (int): The grid level to use.
backup_intermediate (bool, optional): Whether to backup intermediate results. Defaults to False.
"""
gridname = f"permafrost_{grid}{level}"
grid = gpd.read_parquet(DATA_DIR / f"grids/{gridname}_grid.parquet")
grid_gdf = grids.open(grid, level)
for year in track(range(2017, 2025), total=8, description="Processing years..."):
embedding_collection = ee.ImageCollection("GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL")
@ -72,9 +67,9 @@ def download(grid: Literal["hex", "healpix"], level: int, backup_intermediate: b
# Process grid in batches of 100
batch_size = 100
all_results = []
n_batches = len(grid) // batch_size
n_batches = len(grid_gdf) // batch_size
for batch_num, batch_grid in track(
enumerate(np.array_split(grid, n_batches)),
enumerate(np.array_split(grid_gdf, n_batches)),
description="Processing batches...",
total=n_batches,
):
@ -88,18 +83,12 @@ def download(grid: Literal["hex", "healpix"], level: int, backup_intermediate: b
# Store batch results
all_results.append(df_batch)
# Save batch immediately to disk as backup
if backup_intermediate:
batch_filename = f"{gridname}_embeddings-{year}_batch{batch_num:06d}.parquet"
batch_result = batch_grid.merge(df_batch[[*bands, "cell_id"]], on="cell_id", how="left")
batch_result.to_parquet(EMBEDDINGS_DIR / f"{batch_filename}")
# Combine all batch results
df = pd.concat(all_results, ignore_index=True)
embeddings_on_grid = grid.merge(df[[*bands, "cell_id"]], on="cell_id", how="left")
embeddings_file = EMBEDDINGS_DIR / f"{gridname}_embeddings-{year}.parquet"
embeddings_file = get_annual_embeddings_file(grid, level, year)
embeddings_on_grid.to_parquet(embeddings_file)
print(f"Saved embeddings for year {year} to {embeddings_file.resolve()}.")
print(f"Saved embeddings for year {year} to {embeddings_file}.")
@cli.command()
@ -111,36 +100,38 @@ def combine_to_zarr(grid: Literal["hex", "healpix"], level: int):
level (int): The grid level to use.
"""
embs = gpd.read_parquet(DATA_DIR / "embeddings" / f"permafrost_{grid}{level}_embeddings-2017.parquet")
# ? Converting cell IDs from hex strings to integers for xdggs compatibility
cells = [int(cid, 16) for cid in embs.cell_id.to_list()]
cell_ids = grids.get_cell_ids(grid, level)
years = list(range(2017, 2025))
aggs = ["median", "stdDev", "min", "max", "mean", "p1", "p5", "p25", "p75", "p95", "p99"]
bands = [f"A{str(i).zfill(2)}" for i in range(64)]
a = xr.DataArray(
np.nan,
dims=("year", "cell", "band", "agg"),
coords={"year": years, "cell": cells, "band": bands, "agg": aggs},
)
dims=("year", "cell_ids", "band", "agg"),
coords={"year": years, "cell_ids": cell_ids, "band": bands, "agg": aggs},
).astype(np.float32)
# ? These attributes are needed for xdggs
a.cell.attrs = {
a.cell_ids.attrs = {
"grid_name": "h3" if grid == "hex" else "healpix",
"level": level,
}
if grid == "healpix":
a.cell.attrs["indexing_scheme"] = "nested"
a.cell_ids.attrs["indexing_scheme"] = "nested"
for year in track(years, total=len(years), description="Processing years..."):
embs = gpd.read_parquet(DATA_DIR / "embeddings" / f"permafrost_{grid}{level}_embeddings-{year}.parquet")
embeddings_file = get_annual_embeddings_file(grid, level, year)
embs = gpd.read_parquet(embeddings_file)
for band in bands:
for agg in aggs:
col = f"{band}_{agg}"
a.loc[{"band": band, "agg": agg, "year": year}] = embs[col].to_list()
zarr_path = EMBEDDINGS_DIR / f"permafrost_{grid}{level}_embeddings.zarr"
a.to_zarr(zarr_path, consolidated=False, mode="w")
print(f"Saved combined embeddings to {zarr_path.resolve()}.")
a = xdggs.decode(a)
zarr_path = get_embeddings_store(grid, level)
a.to_zarr(zarr_path, consolidated=False, mode="w", encoding=codecs.from_ds(a))
print(f"Saved combined embeddings to {zarr_path}.")
def main(): # noqa: D103

31
src/entropice/codecs.py Normal file
View file

@ -0,0 +1,31 @@
"""Encoding utilities for zarr dataset storage."""
import xarray as xr
from zarr.codecs import BloscCodec
def from_ds(ds: xr.Dataset, store_floats_as_float32: bool = True, include_coords: bool = True) -> dict:
    """Create compression encoding for zarr dataset storage.

    Builds a Blosc (zstd, level 5) compression configuration for every data
    variable — and optionally every coordinate — of the dataset.

    Args:
        ds (xr.Dataset): The xarray Dataset to create encoding for.
        store_floats_as_float32 (bool, optional): Whether to store floating point data as float32.
            Defaults to True.
        include_coords (bool, optional): Whether to include coordinates in the encoding.
            This is useful when appending to an existing store.
            Defaults to True.

    Returns:
        dict: Encoding dictionary with compression settings for each variable.
    """
    names = list(ds.data_vars)
    if include_coords:
        names.extend(ds.coords)
    encoding = {name: {"compressors": BloscCodec(cname="zstd", clevel=5)} for name in names}
    if store_floats_as_float32:
        # Downcast only data variables; coordinates keep their native dtype.
        for name in ds.data_vars:
            if ds[name].dtype == "float64":
                encoding[name]["dtype"] = "float32"
    return encoding

72
src/entropice/darts.py Normal file
View file

@ -0,0 +1,72 @@
"""Labels of Retrogressive-Thaw-Slumps (RTS).
Assumes that the level 1 and level 2 DARTS features have been downloaded into $DATA_DIR / entropyc-rts / darts: https://arcticdata.io/catalog/view/doi:10.18739/A22B8VD7C
Author: Tobias Hölzer
Date: October 2025
"""
from typing import Literal
import cyclopts
import geopandas as gpd
from rich import pretty, print, traceback
from rich.progress import track
from stopuhr import stopwatch
from entropice import grids
from entropice.paths import dartsl2_cov_file, dartsl2_file, get_darts_rts_file
traceback.install()
pretty.install()
def extract_darts_rts(grid: Literal["hex", "healpix"], level: int):
    """Extract RTS labels from DARTS dataset.

    Intersects the DARTS level-2 RTS polygons and coverage polygons with the
    analysis grid, then derives per-cell, per-year statistics (count, area,
    covered area, coverage fraction, RTS density) and writes the enriched
    grid to parquet.

    Args:
        grid (Literal["hex", "healpix"]): The grid type to use.
        level (int): The grid level to use.
    """
    with stopwatch("Load data"):
        darts_l2 = gpd.read_parquet(dartsl2_file)
        darts_cov_l2 = gpd.read_parquet(dartsl2_cov_file)
        grid_gdf = grids.open(grid, level)
    with stopwatch("Extract RTS labels"):
        # Clip RTS / coverage polygons to the grid cells (one row per polygon-cell overlap)
        grid_l2 = grid_gdf.overlay(darts_l2.to_crs(grid_gdf.crs), how="intersection")
        grid_cov_l2 = grid_gdf.overlay(darts_cov_l2.to_crs(grid_gdf.crs), how="intersection")
    # Years are taken from the coverage layer, so cells without RTS still get coverage stats
    years = list(grid_cov_l2["year"].unique())
    for year in track(years, total=len(years), description="Processing years..."):
        with stopwatch("Processing RTS", log=False):
            subset = grid_l2[grid_l2["year"] == year]
            subset_cov = grid_cov_l2[grid_cov_l2["year"] == year]
            # Number of RTS polygon fragments per cell (NaN where a cell has none)
            counts = subset.groupby("cell_id").size()
            grid_gdf[f"darts_{year}_rts_count"] = grid_gdf.cell_id.map(counts)
            # Summed RTS polygon area per cell; units follow the grid CRS — TODO confirm CRS is equal-area
            areas = subset.groupby("cell_id").apply(lambda x: x.geometry.area.sum(), include_groups=False)
            grid_gdf[f"darts_{year}_rts_area"] = grid_gdf.cell_id.map(areas)
            # Area of the cell actually covered by DARTS observations that year
            areas_cov = subset_cov.groupby("cell_id").apply(lambda x: x.geometry.area.sum(), include_groups=False)
            grid_gdf[f"darts_{year}_covered_area"] = grid_gdf.cell_id.map(areas_cov)
            # Fraction of the cell observed, and RTS area relative to the observed area
            grid_gdf[f"darts_{year}_coverage"] = grid_gdf[f"darts_{year}_covered_area"] / grid_gdf.geometry.area
            grid_gdf[f"darts_{year}_rts_density"] = (
                grid_gdf[f"darts_{year}_rts_area"] / grid_gdf[f"darts_{year}_covered_area"]
            )
    output_path = get_darts_rts_file(grid, level)
    grid_gdf.to_parquet(output_path)
    print(f"Saved RTS labels to {output_path}")
    stopwatch.summary()
def main():
    """CLI entry point: dispatch ``extract_darts_rts`` via cyclopts."""
    cyclopts.run(extract_darts_rts)


if __name__ == "__main__":
    main()

View file

@ -1,3 +1,4 @@
# ruff: noqa: PD011
"""Download and preprocess ERA5 data.
Variables of Interest:
@ -9,6 +10,12 @@ Variables of Interest:
- Surface sensible heat flux (sshf) [accum]
- Lake ice bottom temperature (lblt) [instant]
Snow Fall and Total precipitation are both further accumulated over a day -
thus instead of taking the daily sum, only the last value of the day is taken.
This is 00:00 of the next day, e.g.:
tp_2020_06_23 <- tp_2020_06_24_00:00
See: https://confluence.ecmwf.int/display/CKB/ERA5-Land%3A+data+documentation#ERA5Land:datadocumentation-accumulationsAccumulations
Naming patterns:
- Instant Variables are downloaded already as statistically aggregated (lossy),
therefore their names get the aggregation as suffix
@ -66,14 +73,12 @@ Author: Tobias Hölzer
Date: June to October 2025
"""
import os
import cProfile
import time
from pathlib import Path
from typing import Literal
import cyclopts
import dask.distributed as dd
import geopandas as gpd
import numpy as np
import odc.geo
import odc.geo.xr
@ -84,32 +89,16 @@ import xarray as xr
from rich import pretty, print, traceback
from rich.progress import track
from shapely.geometry import LineString, Polygon
from zarr.codecs import BloscCodec
from stopuhr import stopwatch
traceback.install(show_locals=True, suppress=[cyclopts, xr, pd])
from entropice import codecs, grids, watermask
from entropice.paths import get_era5_stores
traceback.install(show_locals=True, suppress=[cyclopts, xr, pd, cProfile])
pretty.install()
cli = cyclopts.App()
DATA_DIR = Path(os.environ.get("DATA_DIR", "data")) / "entropyc-rts"
ERA5_DIR = DATA_DIR / "era5"
DAILY_RAW_PATH = ERA5_DIR / "daily_raw.zarr"
DAILY_ENRICHED_PATH = ERA5_DIR / "daily_enriched.zarr"
MONTHLY_RAW_PATH = ERA5_DIR / "monthly_raw.zarr"
YEARLY_RAW_PATH = ERA5_DIR / "yearly_aligned.zarr"
SUMMER_RAW_PATH = ERA5_DIR / "summer_aligned.zarr"
WINTER_RAW_PATH = ERA5_DIR / "winter_aligned.zarr"
def _get_grid_paths(
agg: Literal["daily", "monthly", "summer", "winter", "yearly"],
grid: Literal["hex", "healpix"],
level: int,
):
gridname = f"permafrost_{grid}{level}"
aligned_path = ERA5_DIR / f"{agg}_{gridname}.zarr"
return aligned_path
min_lat = 50
max_lat = 83.7 # Ensures the right Chunks Size (90 - 64 / 10 + 0.1)
@ -145,24 +134,6 @@ accums = {
}
def create_encoding(ds: xr.Dataset):
"""Create compression encoding for zarr dataset storage.
Creates Blosc compression configuration for all data variables and coordinates
in the dataset using zstd compression with level 5.
Args:
ds (xr.Dataset): The xarray Dataset to create encoding for.
Returns:
dict: Encoding dictionary with compression settings for each variable.
"""
# encoding = {var: {"compressors": BloscCodec(cname="zlib", clevel=9)} for var in ds.data_vars}
encoding = {var: {"compressors": BloscCodec(cname="zstd", clevel=5)} for var in [*ds.data_vars, *ds.coords]}
return encoding
# ================
# === Download ===
# ================
@ -208,16 +179,18 @@ def download_daily_aggregated():
daily_raw = xr.merge(
[
# Instant
era5.t2m.resample(time="1D").max().rename("t2m_max"),
era5.t2m.resample(time="1D").min().rename("t2m_min"),
era5.t2m.resample(time="1D").mean().rename("t2m_mean"),
era5.snowc.resample(time="1D").mean().rename("snowc_mean"),
era5.sde.resample(time="1D").mean().rename("sde_mean"),
era5.lblt.resample(time="1D").max().rename("lblt_max"),
era5.t2m.resample(time="1D").max().rename("t2m_max").astype(np.float32),
era5.t2m.resample(time="1D").min().rename("t2m_min").astype(np.float32),
era5.t2m.resample(time="1D").mean().rename("t2m_mean").astype(np.float32),
era5.snowc.resample(time="1D").mean().rename("snowc_mean").astype(np.float32),
era5.sde.resample(time="1D").mean().rename("sde_mean").astype(np.float32),
era5.lblt.resample(time="1D").max().rename("lblt_max").astype(np.float32),
# Accum
era5.tp.resample(time="1D").sum().rename("tp"),
era5.sf.resample(time="1D").sum().rename("sf"),
era5.sshf.resample(time="1D").sum().rename("sshf"),
era5.sshf.resample(time="1D").sum(skipna=False).rename("sshf").astype(np.float32),
# Precipitation and snow fall are special
# Take only the last value of the day (00:00 of next day)
era5.tp.resample(time="1D").first().shift(time=-1).rename("tp").astype(np.float32),
era5.sf.resample(time="1D").first().shift(time=-1).rename("sf").astype(np.float32),
]
)
@ -225,16 +198,18 @@ def download_daily_aggregated():
daily_raw["t2m_max"].attrs = {"long_name": "Daily maximum 2 metre temperature", "units": "K"}
daily_raw["t2m_min"].attrs = {"long_name": "Daily minimum 2 metre temperature", "units": "K"}
daily_raw["t2m_mean"].attrs = {"long_name": "Daily mean 2 metre temperature", "units": "K"}
daily_raw["tp"].attrs = {"long_name": "Daily total precipitation", "units": "m"}
daily_raw["sf"].attrs = {"long_name": "Daily total snow fall", "units": "m"}
daily_raw["snowc_mean"].attrs = {"long_name": "Daily mean snow cover", "units": "m"}
daily_raw["sde_mean"].attrs = {"long_name": "Daily mean snow depth", "units": "m"}
daily_raw["sshf"].attrs = {"long_name": "Daily total surface sensible heat flux", "units": "J/m²"}
daily_raw["lblt_max"].attrs = {"long_name": "Daily maximum lake ice bottom temperature", "units": "K"}
daily_raw["tp"].attrs = {"long_name": "Daily total precipitation", "units": "m"} # Units are rather m^3 / m^2
daily_raw["sf"].attrs = {"long_name": "Daily total snow fall", "units": "m"} # Units are rather m^3 / m^2
daily_raw["sshf"].attrs = {"long_name": "Daily total surface sensible heat flux", "units": "J/m²"}
daily_raw = daily_raw.odc.assign_crs("epsg:4326")
daily_raw = daily_raw.drop_vars(["surface", "number", "depthBelowLandLayer"])
daily_raw.to_zarr(DAILY_RAW_PATH, mode="w", encoding=create_encoding(daily_raw), consolidated=False)
daily_store = get_era5_stores("daily")
print(f"Saving downloaded and daily aggregated ERA5 data to {daily_store}.")
daily_raw.to_zarr(daily_store, mode="w", encoding=codecs.from_ds(daily_raw), consolidated=False)
@cli.command
@ -252,7 +227,7 @@ def download():
print(client)
print(client.dashboard_link)
download_daily_aggregated()
print(f"Downloaded and aggregated ERA5 data to {DAILY_RAW_PATH.resolve()}.")
print("Downloaded and aggregated ERA5 data.")
# ============================
@ -275,36 +250,62 @@ def daily_enrich():
- Snow isolation index
"""
daily = xr.open_zarr(DAILY_RAW_PATH, consolidated=False).set_coords("spatial_ref")
daily_store = get_era5_stores("daily")
daily = xr.open_zarr(daily_store, consolidated=False).set_coords("spatial_ref")
assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}"
# For better dask performance, all variables are written immediately after calculation
# The smart scheduling which could performantly handle more is according to docs not yet implemented.
# See https://docs.xarray.dev/en/stable/user-guide/dask.html#best-practices point 3 or https://github.com/dask/dask/issues/874
def _store(v: str):
nonlocal daily
encoding = codecs.from_ds(daily[[v]], include_coords=False)
print(f"Storing enriched daily variable {v} to {daily_store}...")
with stopwatch("Storing enriched daily variable"):
daily[[v]].to_zarr(daily_store, mode="a", encoding=encoding, consolidated=False)
daily = xr.open_zarr(daily_store, consolidated=False).set_coords("spatial_ref")
# Formulas based on Groeke et al. (2025) Stochastic Weather generation...
daily["t2m_avg"] = (daily.t2m_max + daily.t2m_min) / 2
daily.t2m_avg.attrs = {"long_name": "Daily average 2 metre temperature", "units": "K"}
_store("t2m_avg")
daily["t2m_range"] = daily.t2m_max - daily.t2m_min
daily.t2m_range.attrs = {"long_name": "Daily range of 2 metre temperature", "units": "K"}
_store("t2m_range")
daily["t2m_skew"] = (daily.t2m_mean - daily.t2m_min) / daily.t2m_range
daily.t2m_skew.attrs = {"long_name": "Daily skewness of 2 metre temperature"}
_store("t2m_skew")
daily["thawing_degree_days"] = (daily.t2m_avg - 273.15).clip(min=0)
daily.thawing_degree_days.attrs = {"long_name": "Thawing degree days", "units": "K"}
_store("thawing_degree_days")
daily["freezing_degree_days"] = (273.15 - daily.t2m_avg).clip(min=0)
daily.freezing_degree_days.attrs = {"long_name": "Freezing degree days", "units": "K"}
_store("freezing_degree_days")
daily["thawing_days"] = (daily.t2m_avg > 273.15).astype(int)
daily.thawing_days.attrs = {"long_name": "Thawing days"}
_store("thawing_days")
daily["freezing_days"] = (daily.t2m_avg < 273.15).astype(int)
daily.freezing_days.attrs = {"long_name": "Freezing days"}
_store("freezing_days")
daily["precipitation_occurrences"] = (daily.tp > 0).astype(int)
daily["precipitation_occurrences"] = (daily.tp > 0.001).astype(int)
daily.precipitation_occurrences.attrs = {"long_name": "Precipitation occurrences"}
daily["snowfall_occurrences"] = (daily.sf > 0).astype(int)
_store("precipitation_occurrences")
daily["snowfall_occurrences"] = (daily.sf > 0.001).astype(int)
daily.snowfall_occurrences.attrs = {"long_name": "Snowfall occurrences"}
_store("snowfall_occurrences")
daily["naive_snow_isolation"] = daily.snowc_mean * daily.sde_mean
daily.naive_snow_isolation.attrs = {"long_name": "Naive snow isolation"}
daily.to_zarr(DAILY_ENRICHED_PATH, mode="w", encoding=create_encoding(daily), consolidated=False)
_store("naive_snow_isolation")
def monthly_aggregate():
@ -318,7 +319,8 @@ def monthly_aggregate():
The aggregated monthly data is saved to a zarr file for further processing.
"""
daily = xr.open_zarr(DAILY_ENRICHED_PATH, consolidated=False)
daily_store = get_era5_stores("daily")
daily = xr.open_zarr(daily_store, consolidated=False).set_coords("spatial_ref")
assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}"
daily = daily.sel(time=slice(min_time, max_time))
@ -348,8 +350,10 @@ def monthly_aggregate():
monthly_accums.append(agg)
monthly = xr.merge(monthly_instants + monthly_accums)
monthly = monthly.chunk({"time": len(monthly.time), "latitude": 64, "longitude": 64})
monthly.to_zarr(MONTHLY_RAW_PATH, mode="w", encoding=create_encoding(monthly), consolidated=False)
monthly = monthly.chunk({"time": len(monthly.time), "latitude": 256, "longitude": 256}) # ~ 100Mb / chunk for f32
monthly_store = get_era5_stores("monthly")
print(f"Saving monthly aggregated ERA5 data to {monthly_store}.")
monthly.to_zarr(monthly_store, mode="w", encoding=codecs.from_ds(monthly), consolidated=False)
def multi_monthly_aggregate(monthly: xr.Dataset, n: int = 12) -> xr.Dataset:
@ -407,47 +411,12 @@ def multi_monthly_aggregate(monthly: xr.Dataset, n: int = 12) -> xr.Dataset:
"link": "https://tc.copernicus.org/articles/11/989/2017/tc-11-989-2017.pdf",
}
multimonthly = multimonthly.chunk({"time": len(multimonthly.time), "latitude": 64, "longitude": 64})
multimonthly = multimonthly.chunk(
{"time": len(multimonthly.time), "latitude": 128, "longitude": 1024}
) # ~36Mb / chunk for f64
return multimonthly
def yearly_and_seasonal_aggregate():
"""Aggregate monthly ERA5 data to yearly resolution with seasonal splits.
Takes monthly aggregated data and creates yearly aggregates using a shifted
calendar (October to September) to better capture Arctic seasonal patterns.
Creates separate aggregates for full year, winter (Oct-Apr), and summer
(May-Sep) periods.
The first and last incomplete years are excluded from the analysis.
Winter months are defined as months 1-7 in the shifted calendar,
and summer months are 8-12.
The final dataset includes yearly, winter, and summer aggregates for all
climate variables, saved to a zarr file.
"""
monthly = xr.open_zarr(MONTHLY_RAW_PATH, consolidated=False).set_coords("spatial_ref")
assert "time" in monthly.dims, f"Expected dim 'time' to be in {monthly.dims=}"
# "Shift" the calendar by slicing the first Jan-Sep and the last Oct-Dec months
first_year = monthly.time.dt.year.min().item()
last_year = monthly.time.dt.year.max().item()
monthly = monthly.sel(time=slice(f"{first_year}-10-01", f"{last_year}-09-30"))
yearly = multi_monthly_aggregate(monthly, n=12)
yearly = derive_yearly_variables(yearly)
yearly.to_zarr(YEARLY_RAW_PATH, mode="w", encoding=create_encoding(yearly), consolidated=False)
summer_winter = multi_monthly_aggregate(monthly, n=6)
summer = summer_winter.sel(time=summer_winter.time.dt.month == 4)
summer.to_zarr(SUMMER_RAW_PATH, mode="w", encoding=create_encoding(summer), consolidated=False)
winter = summer_winter.sel(time=summer_winter.time.dt.month == 10)
winter.to_zarr(WINTER_RAW_PATH, mode="w", encoding=create_encoding(winter), consolidated=False)
def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
"""Derive additional variables from daily data and add them to the yearly dataset.
@ -459,7 +428,8 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
"""
assert "time" in yearly.dims, f"Expected dim 'time' to be in {yearly.dims=}"
daily = xr.open_zarr(DAILY_ENRICHED_PATH, consolidated=False).set_coords("spatial_ref")
daily_store = get_era5_stores("daily")
daily = xr.open_zarr(daily_store, consolidated=False).set_coords("spatial_ref")
assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}"
daily = daily.sel(time=slice(min_time, max_time))
# ? Note: The functions do not really account for leap years
@ -476,7 +446,7 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
# This assumes that the 01-01 is almost everywhere one of the coldest days in the year
first_thaw_day = daily.thawing_days.groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1
first_thaw_day = first_thaw_day.where(~never_thaws).rename("day_of_first_thaw").rename(year="time")
first_thaw_day["time"] = pd.to_datetime([f"{y}-10-01" for y in first_thaw_day.time.values]) # noqa: PD011
first_thaw_day["time"] = pd.to_datetime([f"{y}-10-01" for y in first_thaw_day.time.values])
first_thaw_day.attrs = {"long_name": "Day of first thaw in year", "units": "day of year"}
yearly["day_of_first_thaw"] = first_thaw_day.sel(time=yearly.time)
@ -484,7 +454,7 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
n_days_in_year - daily.thawing_days[::-1].groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1
)
last_thaw_day = last_thaw_day.where(~never_thaws).rename("day_of_last_thaw").rename(year="time")
last_thaw_day["time"] = pd.to_datetime([f"{y}-10-01" for y in last_thaw_day.time.values]) # noqa: PD011
last_thaw_day["time"] = pd.to_datetime([f"{y}-10-01" for y in last_thaw_day.time.values])
last_thaw_day.attrs = {"long_name": "Day of last thaw in year", "units": "day of year"}
yearly["day_of_last_thaw"] = last_thaw_day.sel(time=yearly.time)
@ -506,7 +476,7 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
first_freezing_day = daily_shifted.freezing_days.groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1
first_freezing_day = first_freezing_day.where(~never_freezes).rename("day_of_first_freeze").rename(year="time")
first_freezing_day["time"] = pd.to_datetime([f"{y}-10-01" for y in first_freezing_day.time.values]) # noqa: PD011
first_freezing_day["time"] = pd.to_datetime([f"{y}-10-01" for y in first_freezing_day.time.values])
first_freezing_day.attrs = {"long_name": "Day of first freeze in year", "units": "day of year"}
yearly["day_of_first_freeze"] = first_freezing_day.sel(time=yearly.time)
@ -516,7 +486,7 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
+ 1
)
last_freezing_day = last_freezing_day.where(~never_freezes).rename("day_of_last_freeze").rename(year="time")
last_freezing_day["time"] = pd.to_datetime([f"{y}-10-01" for y in last_freezing_day.time.values]) # noqa: PD011
last_freezing_day["time"] = pd.to_datetime([f"{y}-10-01" for y in last_freezing_day.time.values])
last_freezing_day.attrs = {"long_name": "Day of last freeze in year", "units": "day of year"}
yearly["day_of_last_freeze"] = last_freezing_day.sel(time=yearly.time)
@ -528,6 +498,53 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
return yearly
def yearly_and_seasonal_aggregate():
    """Aggregate monthly ERA5 data to yearly resolution with seasonal splits.

    Takes monthly aggregated data and creates yearly aggregates using a shifted
    calendar (October to September) to better capture Arctic seasonal patterns.
    Creates separate aggregates for full year, winter (Oct-Apr), and summer
    (May-Sep) periods.
    The first and last incomplete years are excluded from the analysis.
    Winter months are defined as months 1-7 in the shifted calendar,
    and summer months are 8-12.
    The final dataset includes yearly, winter, and summer aggregates for all
    climate variables, saved to separate zarr stores.
    """
    monthly_store = get_era5_stores("monthly")
    monthly = xr.open_zarr(monthly_store, consolidated=False).set_coords("spatial_ref")
    assert "time" in monthly.dims, f"Expected dim 'time' to be in {monthly.dims=}"
    # "Shift" the calendar by slicing the first Jan-Sep and the last Oct-Dec months
    first_year = monthly.time.dt.year.min().item()
    last_year = monthly.time.dt.year.max().item()
    monthly = monthly.sel(time=slice(f"{first_year}-10-01", f"{last_year}-09-30"))
    # Full (shifted) year: 12-month aggregation plus daily-derived variables
    yearly = multi_monthly_aggregate(monthly, n=12)
    yearly = derive_yearly_variables(yearly)
    yearly = yearly.chunk({"time": len(yearly.time), "latitude": 256, "longitude": 1024})  # ~36Mb / chunk for f32
    yearly_store = get_era5_stores("yearly")
    print(f"Saving yearly aggregated ERA5 data to {yearly_store}.")
    yearly.to_zarr(yearly_store, mode="w", encoding=codecs.from_ds(yearly), consolidated=False)
    # 6-month aggregation yields two timestamps per shifted year:
    # month == 4 marks the summer half, month == 10 the winter half
    summer_winter = multi_monthly_aggregate(monthly, n=6)
    summer = summer_winter.sel(time=summer_winter.time.dt.month == 4)
    summer = summer.chunk({"time": len(summer.time), "latitude": 256, "longitude": 1024})  # ~36Mb / chunk for f32
    summer_store = get_era5_stores("summer")
    print(f"Saving summer aggregated ERA5 data to {summer_store}.")
    summer.to_zarr(summer_store, mode="w", encoding=codecs.from_ds(summer), consolidated=False)
    winter = summer_winter.sel(time=summer_winter.time.dt.month == 10)
    winter = winter.chunk({"time": len(winter.time), "latitude": 256, "longitude": 1024})  # ~36Mb / chunk for f32
    winter_store = get_era5_stores("winter")
    print(f"Saving winter aggregated ERA5 data to {winter_store}.")
    winter.to_zarr(winter_store, mode="w", encoding=codecs.from_ds(winter), consolidated=False)
@cli.command
def enrich(n_workers: int = 10, monthly: bool = True, yearly: bool = True, daily: bool = True):
"""Enrich data and perform temporal aggregation of ERA5 data using Dask cluster.
@ -591,15 +608,16 @@ def _check_geom(geobox: odc.geo.geobox.GeoBox, geom: odc.geo.Geometry) -> bool:
return (roix.stop - roix.start) > 1 and (roiy.stop - roiy.start) > 1
def extract_cell_data(yearly: xr.Dataset, geom: Polygon):
"""Extract ERA5 data for a specific grid cell geometry.
Extracts and spatially averages ERA5 data within the bounds of a grid cell.
Handles antimeridian-crossing cells by splitting them appropriately.
@stopwatch("Getting corrected geometries", log=False)
def _get_corrected_geoms(geom: Polygon, gbox: odc.geo.geobox.GeoBox) -> list[odc.geo.Geometry]:
"""Get corrected geometries for antimeridian-crossing polygons.
Args:
yearly (xr.Dataset): Yearly aggregated ERA5 dataset.
geom (Polygon): Polygon geometry of the grid cell.
geom (Polygon): Input polygon geometry.
gbox (odc.geo.geobox.GeoBox): GeoBox for spatial reference.
Returns:
list[odc.geo.Geometry]: List of corrected, georeferenced geometries.
"""
# cell.geometry is a shapely Polygon
@ -608,28 +626,73 @@ def extract_cell_data(yearly: xr.Dataset, geom: Polygon):
# Split geometry in case it crossed antimeridian
else:
geoms = _split_antimeridian_cell(geom)
cell_data = []
for geom in geoms:
geom = odc.geo.Geometry(geom, crs="epsg:4326")
if not _check_geom(yearly.odc.geobox, geom):
continue
cell_data.append(yearly.odc.crop(geom).drop_vars("spatial_ref").mean(["latitude", "longitude"]))
if len(cell_data) == 0:
geoms = [odc.geo.Geometry(g, crs="epsg:4326") for g in geoms]
geoms = filter(lambda g: _check_geom(gbox, g), geoms)
return geoms
@stopwatch("Extracting cell data", log=False)
def extract_cell_data(ds: xr.Dataset, geoms: list[odc.geo.Geometry]) -> xr.Dataset | bool:
"""Extract ERA5 data for a specific grid cell geometry.
Extracts and spatially averages ERA5 data within the bounds of a grid cell.
Handles antimeridian-crossing cells by splitting them appropriately.
Args:
ds (xr.Dataset): An ERA5 dataset.
geoms (list[odc.geo.Geometry]): List of (valid) geometries of the grid cell.
"""
if len(geoms) == 0:
return False
elif len(cell_data) == 1:
cell_data = cell_data[0]
elif len(geoms) == 1:
return ds.odc.crop(geoms[0]).drop_vars("spatial_ref").mean(["latitude", "longitude"], skipna=True).compute()
else:
cell_data = xr.concat(cell_data, dim="part").mean("part")
cell_data = cell_data.compute()
return cell_data
parts = [
ds.odc.crop(geom).drop_vars("spatial_ref").mean(["latitude", "longitude"], skipna=True) for geom in geoms
]
parts = [part for part in parts if part.latitude.size > 0 and part.longitude.size > 0]
if len(parts) == 0:
raise ValueError("No valid parts found for geometry. This should not happen!")
elif len(parts) == 1:
return parts[0].compute()
else:
return xr.concat(parts, dim="part").mean("part", skipna=True).compute()
def _correct_longs(ds: xr.Dataset) -> xr.Dataset:
    """Wrap longitudes from the [0, 360) convention into [-180, 180) and sort ascending."""
    wrapped = ((ds.longitude + 180) % 360) - 180
    return ds.assign_coords(longitude=wrapped).sortby("longitude")
@stopwatch("Creating aligned dataset", log=False)
def _create_aligned(
    ds: xr.Dataset, data: dict[str, np.ndarray], grid: Literal["hex", "healpix"], level: int
) -> xr.Dataset:
    """Assemble a grid-aligned dataset from per-cell extracted values.

    Args:
        ds (xr.Dataset): Source ERA5 dataset; provides the time coordinate and
            per-variable attributes.
        data (dict[str, np.ndarray]): Per-variable arrays; assumed shape
            (cell_ids, time) matching the grid and ``ds.time`` — TODO confirm.
        grid (Literal["hex", "healpix"]): The grid type to use.
        level (int): The grid level to use.

    Returns:
        xr.Dataset: Dataset indexed by (cell_ids, time) with grid metadata attached.
    """
    cell_ids = grids.get_cell_ids(grid, level)
    data_vars = {var: (["cell_ids", "time"], values) for var, values in data.items()}
    aligned = xr.Dataset(
        data_vars,
        coords={"cell_ids": cell_ids, "time": ds.time},
    )
    # Grid metadata on the cell_ids coordinate (xdggs-style convention)
    gridinfo = {
        "grid_name": "h3" if grid == "hex" else grid,
        "level": level,
    }
    if grid == "healpix":
        gridinfo["indexing_scheme"] = "nested"
    aligned.cell_ids.attrs = gridinfo
    # Carry over variable attributes (long_name, units, ...) from the source dataset
    for var in ds.data_vars:
        aligned[var].attrs = ds[var].attrs
    # Single time chunk; cap cell chunks at 10k cells
    aligned = aligned.chunk({"cell_ids": min(len(aligned.cell_ids), 10000), "time": len(aligned.time)})
    return aligned
@cli.command
def spatial_agg(
    grid: Literal["hex", "healpix"],
    level: int,
    agg: Literal["summer", "winter", "yearly"] = "yearly",
    n_workers: int = 10,
):
    """Perform spatial aggregation of ERA5 data to grid cells.
    @ -640,69 +703,73 @@ def spatial_agg(
    Args:
        grid ("hex" | "healpix"): Grid type.
        level (int): Grid resolution level.
        agg ("summer" | "winter" | "yearly"): Type of aggregation to perform. Defaults to yearly.
        n_workers (int, optional): Number of parallel workers to use. Defaults to 10.
    """
    # NOTE(review): this span is a diff render whose +/- markers were stripped, so
    # lines of the removed (single-`agg`) implementation are interleaved with the
    # added (all-three-aggregations) one; as shown it is NOT valid standalone
    # Python. Old-side runs are flagged below — reconcile against the committed file.
    # NOTE(review): `n_workers` appears unused in the visible body — confirm intent.
    # --- old implementation (removed): resolve one store from the `agg` argument ---
    gridname = f"permafrost_{grid}{level}"
    agg_grid_path = _get_grid_paths(agg, grid, level)
    grid_df = gpd.read_parquet(DATA_DIR / f"grids/{gridname}_grid.parquet")
    # Create an empty zarr array with the right dimensions
    if agg == "summer":
        agg_data_path = SUMMER_RAW_PATH
    elif agg == "winter":
        agg_data_path = WINTER_RAW_PATH
    elif agg == "yearly":
        agg_data_path = YEARLY_RAW_PATH
    else:
        raise ValueError(f"Unknown aggregation type: {agg}")
    agg_raw = (
        xr.open_zarr(agg_data_path, consolidated=False, decode_timedelta=False)
        .set_coords("spatial_ref")
        .drop_vars(["surface", "number", "depthBelowLandLayer"])
        .load()
    )
    assert {"latitude", "longitude", "time"} == set(agg_raw.dims), (
        f"Expected dims ('latitude', 'longitude', 'time'), got {agg_raw.dims}"
    )
    assert agg_raw.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {agg_raw.odc.crs}"
    # --- new implementation: water-masked grid, then load all three unaligned stores ---
    grid_gdf = grids.open(grid, level)
    # ? Mask out water, since we don't want to aggregate over oceans
    grid_gdf = watermask.clip_grid(grid_gdf)
    # Convert lons to -180 to 180 instead of 0 to 360
    agg_raw = agg_raw.assign_coords(longitude=(((agg_raw.longitude + 180) % 360) - 180)).sortby("longitude")
    summer_unaligned_store = get_era5_stores("summer")
    winter_unaligned_store = get_era5_stores("winter")
    yearly_unaligned_store = get_era5_stores("yearly")
    with stopwatch("Loading summer ERA5 data"):
        summer_unaligned = xr.open_zarr(summer_unaligned_store, consolidated=False).set_coords("spatial_ref")
        assert {"latitude", "longitude", "time"} == set(summer_unaligned.dims)
        assert summer_unaligned.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {summer_unaligned.odc.crs}"
        summer_unaligned = _correct_longs(summer_unaligned)
    with stopwatch("Loading winter ERA5 data"):
        winter_unaligned = xr.open_zarr(winter_unaligned_store, consolidated=False).set_coords("spatial_ref")
        assert {"latitude", "longitude", "time"} == set(winter_unaligned.dims)
        assert winter_unaligned.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {winter_unaligned.odc.crs}"
        winter_unaligned = _correct_longs(winter_unaligned)
    with stopwatch("Loading yearly ERA5 data"):
        yearly_unaligned = xr.open_zarr(yearly_unaligned_store, consolidated=False).set_coords("spatial_ref")
        assert {"latitude", "longitude", "time"} == set(yearly_unaligned.dims)
        assert yearly_unaligned.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {yearly_unaligned.odc.crs}"
        yearly_unaligned = _correct_longs(yearly_unaligned)
    # --- old implementation (removed): pre-allocate one aligned zarr template ---
    # ? Converting cell IDs from hex strings to integers for xdggs compatibility
    cells = [int(cid, 16) for cid in grid_df.cell_id.to_list()]
    agg_aligned = (
        xr.zeros_like(agg_raw.isel(latitude=0, longitude=0).drop_vars(["latitude", "longitude"]))
        .expand_dims({"cell_ids": cells})
        .chunk({"cell_ids": min(len(grid_df), 10000), "time": len(agg_raw.time)})
    )
    agg_aligned.cell_ids.attrs = {
        "grid_name": "h3" if grid == "hex" else grid,
        "level": level,
    # NOTE(review): syntax breaks here — the dict literal above is never closed
    # before the next (new-side) assignment begins; an artifact of the stripped diff.
    # --- new implementation: NaN-filled (n_cells, n_times) buffers per variable ---
    summer_data = {
        var: np.full((len(grid_gdf), len(summer_unaligned.time)), np.nan, dtype=np.float32)
        for var in summer_unaligned.data_vars
    }
    if grid == "healpix":
        agg_aligned.cell_ids.attrs["indexing_scheme"] = "nested"
    # NOTE(review): function-scope re-import of `stopwatch` — likely a leftover on
    # the removed side; the module presumably imports it at top level. Verify.
    from stopuhr import stopwatch
    for _, row in track(
        grid_df.to_crs("epsg:4326").iterrows(),
        total=len(grid_df),
    winter_data = {
        var: np.full((len(grid_gdf), len(winter_unaligned.time)), np.nan, dtype=np.float32)
        for var in winter_unaligned.data_vars
    }
    yearly_data = {
        var: np.full((len(grid_gdf), len(yearly_unaligned.time)), np.nan, dtype=np.float32)
        for var in yearly_unaligned.data_vars
    }
    for i, (_, row) in track(
        enumerate(grid_gdf.to_crs("epsg:4326").iterrows()),
        total=len(grid_gdf),
        description="Spatially aggregating ERA5 data...",
    ):
        # --- old loop body (removed) ---
        cell_id = int(row.cell_id, 16)
        with stopwatch("Extracting cell data", log=False):
            cell_data = extract_cell_data(agg_raw, row.geometry)
        if cell_data is False:
            print(f"Warning: No data found for cell {cell_id}, skipping.")
        # --- new loop body: split antimeridian geometries, then average per store ---
        geoms = _get_corrected_geoms(row.geometry, summer_unaligned.odc.geobox)
        if len(geoms) == 0:
            print(f"Warning: No valid geometry for cell {row.cell_id}.")
            continue
        with stopwatch("Assigning cell data", log=False):
            agg_aligned.loc[{"cell_ids": cell_id}] = cell_data
    agg_aligned.to_zarr(agg_grid_path, mode="w", consolidated=False, encoding=create_encoding(agg_aligned))
        cell_data = extract_cell_data(summer_unaligned, geoms)
        for var in summer_unaligned.data_vars:
            # BUG(review): ndarray.astype() requires a dtype argument — this raises
            # TypeError at runtime. Should be plain `.values` (matching the winter
            # and yearly branches below) or `.values.astype(np.float32)`.
            summer_data[var][i, :] = cell_data[var].values.astype()
        cell_data = extract_cell_data(winter_unaligned, geoms)
        for var in winter_unaligned.data_vars:
            winter_data[var][i, :] = cell_data[var].values
        cell_data = extract_cell_data(yearly_unaligned, geoms)
        for var in yearly_unaligned.data_vars:
            yearly_data[var][i, :] = cell_data[var].values
    # Assemble aligned datasets and write one zarr store per aggregation.
    summer = _create_aligned(summer_unaligned, summer_data, grid, level)
    winter = _create_aligned(winter_unaligned, winter_data, grid, level)
    yearly = _create_aligned(yearly_unaligned, yearly_data, grid, level)
    summer_store = get_era5_stores("summer", grid, level)
    winter_store = get_era5_stores("winter", grid, level)
    yearly_store = get_era5_stores("yearly", grid, level)
    summer.to_zarr(summer_store, mode="w", consolidated=False, encoding=codecs.from_ds(summer))
    winter.to_zarr(winter_store, mode="w", consolidated=False, encoding=codecs.from_ds(winter))
    yearly.to_zarr(yearly_store, mode="w", consolidated=False, encoding=codecs.from_ds(yearly))
    print("Finished spatial matching.")
    stopwatch.summary()

View file

@ -4,8 +4,6 @@ Author: Tobias Hölzer
Date: 09. June 2025
"""
import os
from pathlib import Path
from typing import Literal
import cartopy.crs as ccrs
@ -25,14 +23,45 @@ from shapely.ops import transform
from stopuhr import stopwatch
from xdggs.healpix import HealpixInfo
from entropice.paths import get_grid_file, get_grid_viz_file, watermask_file
# Pretty tracebacks / repr for interactive use (import-time side effect).
traceback.install()
pretty.install()
# NOTE(review): the constants below appear to be the removed side of this diff —
# DATA_DIR/GRIDS_DIR/FIGURES_DIR now live in entropice.paths (imported above).
# Confirm they are gone from the committed file; keeping both would duplicate
# state and create directories at import time in two places.
DATA_DIR = Path(os.environ.get("DATA_DIR", "../../data")) / "entropyc-rts"
GRIDS_DIR = DATA_DIR / "grids"
FIGURES_DIR = DATA_DIR / "figures"
GRIDS_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
def open(grid: Literal["hex", "healpix"], level: int):
    """Open a saved grid from its parquet file.

    Note: intentionally named ``open`` for the ``grids.open(...)`` module API;
    it shadows the builtin within this module.

    Args:
        grid (Literal["hex", "healpix"]): The grid type to use.
        level (int): The grid level to use.

    Returns:
        GeoDataFrame: The loaded grid.
    """
    return gpd.read_parquet(get_grid_file(grid, level))
def get_cell_ids(grid: Literal["hex", "healpix"], level: int):
    """Get the cell IDs of a saved grid.

    Args:
        grid (Literal["hex", "healpix"]): The grid type to use.
        level (int): The grid level to use.

    Returns:
        list: The cell IDs — integers for hex (H3) grids, the stored values otherwise.
    """
    ids = open(grid, level)["cell_id"].tolist()
    if grid != "hex":
        return ids
    # H3 cell IDs are stored as hex strings; xdggs expects integers.
    return [int(cid, 16) for cid in ids]
@stopwatch("Create a global hex grid")
@ -131,7 +160,7 @@ def filter_permafrost_grid(grid: gpd.GeoDataFrame):
grid = grid.to_crs("EPSG:3413")
# Filter out non-land areas (e.g., oceans)
water_mask = gpd.read_file(DATA_DIR / "simplified-water-polygons-split-3857/simplified_water_polygons.shp")
water_mask = gpd.read_file(watermask_file)
water_mask = water_mask.to_crs("EPSG:3413")
ov = gpd.overlay(grid, water_mask, how="intersection")
@ -226,14 +255,14 @@ def cli(grid: Literal["hex", "healpix"], level: int):
print("No valid grid cells found.")
return
grid_file = GRIDS_DIR / f"permafrost_{grid}{level}_grid.parquet"
grid_file = get_grid_file(grid, level)
grid_gdf.to_parquet(grid_file)
print(f"Saved to {grid_file.resolve()}")
print(f"Saved to {grid_file}")
fig = vizualize_grid(grid_gdf, grid, level)
fig_file = FIGURES_DIR / f"permafrost_{grid}{level}_grid.png"
fig_file = get_grid_viz_file(grid, level)
fig.savefig(fig_file, dpi=300)
print(f"Saved figure to {fig_file.resolve()}")
print(f"Saved figure to {fig_file}")
plt.close(fig)

75
src/entropice/paths.py Normal file
View file

@ -0,0 +1,75 @@
# ruff: noqa: D103
"""Paths for entropice data storage."""
import os
from pathlib import Path
from typing import Literal
# Root data directory; override with the DATA_DIR environment variable.
DATA_DIR = Path(os.environ.get("DATA_DIR", "data")).resolve() / "entropice"
GRIDS_DIR = DATA_DIR / "grids"
FIGURES_DIR = DATA_DIR / "figures"
DARTS_DIR = DATA_DIR / "darts"
ERA5_DIR = DATA_DIR / "era5"
EMBEDDINGS_DIR = DATA_DIR / "embeddings"
WATERMASK_DIR = DATA_DIR / "watermask"
# NOTE(review): creating directories at import time is a side effect — any
# `import entropice.paths` touches the filesystem. Acceptable for a pipeline
# package, but worth confirming it is intended.
GRIDS_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
DARTS_DIR.mkdir(parents=True, exist_ok=True)
ERA5_DIR.mkdir(parents=True, exist_ok=True)
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
WATERMASK_DIR.mkdir(parents=True, exist_ok=True)
# NOTE(review): WATERMASK_DIR is created above but unused here — the watermask
# shapefile is expected directly under DATA_DIR. Possibly this should be
# WATERMASK_DIR / "simplified_water_polygons.shp"; confirm against the data layout.
watermask_file = DATA_DIR / "simplified_water_polygons.shp"
# DARTS level-2 RTS feature and coverage parquet files (Nitze et al. v1.2).
dartsl2_file = DARTS_DIR / "DARTS_NitzeEtAl_v1-2_features_2018-2023_level2.parquet"
dartsl2_cov_file = DARTS_DIR / "DARTS_NitzeEtAl_v1-2_coverage_2018-2023_level2.parquet"
def _get_gridname(grid: Literal["hex", "healpix"], level: int) -> str:
return f"permafrost_{grid}{level}"
def get_grid_file(grid: Literal["hex", "healpix"], level: int) -> Path:
    """Path of the saved parquet grid for the given grid type and level."""
    return GRIDS_DIR / f"{_get_gridname(grid, level)}_grid.parquet"
def get_grid_viz_file(grid: Literal["hex", "healpix"], level: int) -> Path:
    """Path of the grid-visualization PNG for the given grid type and level."""
    return FIGURES_DIR / f"{_get_gridname(grid, level)}_grid.png"
def get_darts_rts_file(grid: Literal["hex", "healpix"], level: int) -> Path:
    """Path of the per-grid DARTS RTS-label parquet file."""
    return DARTS_DIR / f"{_get_gridname(grid, level)}_rts.parquet"
def get_annual_embeddings_file(grid: Literal["hex", "healpix"], level: int, year: int) -> Path:
    """Path of the per-grid satellite-embeddings parquet file for one year."""
    return EMBEDDINGS_DIR / f"{_get_gridname(grid, level)}_embeddings-{year}.parquet"
def get_embeddings_store(grid: Literal["hex", "healpix"], level: int) -> Path:
    """Path of the per-grid satellite-embeddings zarr store (all years)."""
    return EMBEDDINGS_DIR / f"{_get_gridname(grid, level)}_embeddings.zarr"
def get_era5_stores(
    agg: Literal["daily", "monthly", "summer", "winter", "yearly"],
    grid: Literal["hex", "healpix"] | None = None,
    level: int | None = None,
):
    """Zarr store path for ERA5 data at the given temporal aggregation.

    When both ``grid`` and ``level`` are given, returns the grid-aligned store;
    otherwise returns the unaligned (lat/lon raster) store.
    """
    if grid is not None and level is not None:
        return ERA5_DIR / f"{_get_gridname(grid, level)}_{agg}_climate.zarr"
    return ERA5_DIR / f"{agg}_climate_unaligned.zarr"

View file

@ -0,0 +1,32 @@
"""Helpers for the watermask."""
import geopandas as gpd
from entropice.paths import watermask_file
def open():
    """Open the watermask shapefile.

    Note: intentionally named ``open`` for the ``watermask.open()`` module API;
    it shadows the builtin within this module.

    Returns:
        GeoDataFrame: The watermask polygons.
    """
    return gpd.read_file(watermask_file)
def clip_grid(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Remove water areas from the input GeoDataFrame via the watermask.

    Args:
        gdf (gpd.GeoDataFrame): The input GeoDataFrame to clip.

    Returns:
        gpd.GeoDataFrame: The input with watermask polygons subtracted.
    """
    # NOTE(review): the mask is reprojected to EPSG:3413, so this assumes `gdf`
    # is already in EPSG:3413 — overlay on mismatched CRS misbehaves. Confirm callers.
    mask = open().to_crs("EPSG:3413")
    return gdf.overlay(mask, how="difference")

View file

@ -1,66 +0,0 @@
"""Labels of Retrogressive-Thaw-Slumps (RTS).
Assumes that the level 1 and level 2 DARTS features have been downloaded into $DATA_DIR / entropyc-rts / darts: https://arcticdata.io/catalog/view/doi:10.18739/A22B8VD7C
Author: Tobias Hölzer
Date: October 2025
"""
import os
from pathlib import Path
from typing import Literal
import cyclopts
import geopandas as gpd
from rich.progress import track
# Root data directory; override with the DATA_DIR environment variable.
DATA_DIR = Path(os.environ.get("DATA_DIR", "../../data")) / "entropyc-rts"
# DARTS level-2 RTS feature and coverage parquet files (Nitze et al. v1.2),
# expected to be downloaded manually (see module docstring for the DOI).
LEVEL2_PATH = DATA_DIR / "darts" / "DARTS_NitzeEtAl_v1-2_features_2018-2023_level2.parquet"
LEVEL2_COV_PATH = DATA_DIR / "darts" / "DARTS_NitzeEtAl_v1-2_coverage_2018-2023_level2.parquet"
def extract_darts_rts(grid: Literal["hex", "healpix"], level: int):
    """Extract RTS labels from the DARTS dataset onto a saved grid.

    Writes per-cell, per-year RTS counts, areas, coverage and density columns
    to a parquet file next to the DARTS data.

    Args:
        grid (Literal["hex", "healpix"]): The grid type to use.
        level (int): The grid level to use.
    """
    darts_l2 = gpd.read_parquet(LEVEL2_PATH)
    darts_cov_l2 = gpd.read_parquet(LEVEL2_COV_PATH)
    gridname = f"permafrost_{grid}{level}"
    # Avoid rebinding the `grid` parameter: the grid cells get their own name.
    cells = gpd.read_parquet(DATA_DIR / f"grids/{gridname}_grid.parquet")

    # Intersect RTS features and coverage footprints with the grid cells once.
    rts_by_cell = cells.overlay(darts_l2.to_crs(cells.crs), how="intersection")
    cov_by_cell = cells.overlay(darts_cov_l2.to_crs(cells.crs), how="intersection")

    def summed_area(frame):
        # Total intersected geometry area per cell id.
        return frame.groupby("cell_id").apply(lambda x: x.geometry.area.sum(), include_groups=False)

    years = list(cov_by_cell["year"].unique())
    for year in track(years, total=len(years), description="Processing years..."):
        rts_year = rts_by_cell[rts_by_cell["year"] == year]
        cov_year = cov_by_cell[cov_by_cell["year"] == year]
        cells[f"darts_{year}_rts_count"] = cells.cell_id.map(rts_year.groupby("cell_id").size())
        cells[f"darts_{year}_rts_area"] = cells.cell_id.map(summed_area(rts_year))
        cells[f"darts_{year}_covered_area"] = cells.cell_id.map(summed_area(cov_year))
        cells[f"darts_{year}_coverage"] = cells[f"darts_{year}_covered_area"] / cells.geometry.area
        cells[f"darts_{year}_rts_density"] = cells[f"darts_{year}_rts_area"] / cells[f"darts_{year}_covered_area"]

    output_path = DATA_DIR / f"darts/{gridname}_darts.parquet"
    cells.to_parquet(output_path)
    print(f"Saved RTS labels to {output_path}")
# CLI entry point: cyclopts builds the argument parser from extract_darts_rts's signature.
def main():  # noqa: D103
    cyclopts.run(extract_darts_rts)
if __name__ == "__main__":
    main()

File diff suppressed because one or more lines are too long

2
uv.lock generated
View file

@ -1199,7 +1199,7 @@ wheels = [
]
[[package]]
name = "entropic-perma-risk"
name = "entropice"
version = "0.1.0"
source = { editable = "." }
dependencies = [