This commit is contained in:
Tobias Hölzer 2025-10-26 18:28:46 +01:00
parent 3ad332b5a8
commit eeab8fff1e
16 changed files with 536 additions and 943 deletions

View file

@ -1,5 +1,5 @@
[project] [project]
name = "entropic-perma-risk" name = "entropice"
version = "0.1.0" version = "0.1.0"
description = "Add your description here" description = "Add your description here"
readme = "README.md" readme = "README.md"
@ -49,10 +49,14 @@ dependencies = [
] ]
[project.scripts] [project.scripts]
create-grid = "steps.s0_0_grids.create_grid:main" create-grid = "entropice.grids:main"
rts = "steps.s0_1_rts.rts:main" darts = "entropice.darts:main"
alpha-earth = "steps.s1_0_alphaearth.alphaearth:main" alpha-earth = "entropice.alphaearth:main"
era5 = "steps.s1_1_era5.era5:cli" era5 = "entropice.era5:cli"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.uv] [tool.uv]
package = true package = true

View file

@ -0,0 +1,2 @@
def hello() -> str:
    """Return the package greeting string."""
    greeting = "Hello from entropice!"
    return greeting

View file

@ -4,9 +4,7 @@ Author: Tobias Hölzer
Date: October 2025 Date: October 2025
""" """
import os
import warnings import warnings
from pathlib import Path
from typing import Literal from typing import Literal
import cyclopts import cyclopts
@ -16,9 +14,13 @@ import geopandas as gpd
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import xarray as xr import xarray as xr
import xdggs
from rich import pretty, print, traceback from rich import pretty, print, traceback
from rich.progress import track from rich.progress import track
from entropice import codecs, grids
from entropice.paths import get_annual_embeddings_file, get_embeddings_store
# Filter out the GeoDataFrame.swapaxes deprecation warning # Filter out the GeoDataFrame.swapaxes deprecation warning
warnings.filterwarnings("ignore", message=".*GeoDataFrame.swapaxes.*", category=FutureWarning) warnings.filterwarnings("ignore", message=".*GeoDataFrame.swapaxes.*", category=FutureWarning)
@ -26,26 +28,19 @@ pretty.install()
traceback.install() traceback.install()
ee.Initialize(project="ee-tobias-hoelzer") ee.Initialize(project="ee-tobias-hoelzer")
DATA_DIR = Path(os.environ.get("DATA_DIR", "../../data")) / "entropyc-rts"
EMBEDDINGS_DIR = DATA_DIR / "embeddings"
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
cli = cyclopts.App(name="alpha-earth") cli = cyclopts.App(name="alpha-earth")
@cli.command() @cli.command()
def download(grid: Literal["hex", "healpix"], level: int, backup_intermediate: bool = False): def download(grid: Literal["hex", "healpix"], level: int):
"""Extract satellite embeddings from Google Earth Engine and map them to a grid. """Extract satellite embeddings from Google Earth Engine and map them to a grid.
Args: Args:
grid (Literal["hex", "healpix"]): The grid type to use. grid (Literal["hex", "healpix"]): The grid type to use.
level (int): The grid level to use. level (int): The grid level to use.
backup_intermediate (bool, optional): Whether to backup intermediate results. Defaults to False.
""" """
gridname = f"permafrost_{grid}{level}" grid_gdf = grids.open(grid, level)
grid = gpd.read_parquet(DATA_DIR / f"grids/{gridname}_grid.parquet")
for year in track(range(2017, 2025), total=8, description="Processing years..."): for year in track(range(2017, 2025), total=8, description="Processing years..."):
embedding_collection = ee.ImageCollection("GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL") embedding_collection = ee.ImageCollection("GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL")
@ -72,9 +67,9 @@ def download(grid: Literal["hex", "healpix"], level: int, backup_intermediate: b
# Process grid in batches of 100 # Process grid in batches of 100
batch_size = 100 batch_size = 100
all_results = [] all_results = []
n_batches = len(grid) // batch_size n_batches = len(grid_gdf) // batch_size
for batch_num, batch_grid in track( for batch_num, batch_grid in track(
enumerate(np.array_split(grid, n_batches)), enumerate(np.array_split(grid_gdf, n_batches)),
description="Processing batches...", description="Processing batches...",
total=n_batches, total=n_batches,
): ):
@ -88,18 +83,12 @@ def download(grid: Literal["hex", "healpix"], level: int, backup_intermediate: b
# Store batch results # Store batch results
all_results.append(df_batch) all_results.append(df_batch)
# Save batch immediately to disk as backup
if backup_intermediate:
batch_filename = f"{gridname}_embeddings-{year}_batch{batch_num:06d}.parquet"
batch_result = batch_grid.merge(df_batch[[*bands, "cell_id"]], on="cell_id", how="left")
batch_result.to_parquet(EMBEDDINGS_DIR / f"{batch_filename}")
# Combine all batch results # Combine all batch results
df = pd.concat(all_results, ignore_index=True) df = pd.concat(all_results, ignore_index=True)
embeddings_on_grid = grid.merge(df[[*bands, "cell_id"]], on="cell_id", how="left") embeddings_on_grid = grid.merge(df[[*bands, "cell_id"]], on="cell_id", how="left")
embeddings_file = EMBEDDINGS_DIR / f"{gridname}_embeddings-{year}.parquet" embeddings_file = get_annual_embeddings_file(grid, level, year)
embeddings_on_grid.to_parquet(embeddings_file) embeddings_on_grid.to_parquet(embeddings_file)
print(f"Saved embeddings for year {year} to {embeddings_file.resolve()}.") print(f"Saved embeddings for year {year} to {embeddings_file}.")
@cli.command() @cli.command()
@ -111,36 +100,38 @@ def combine_to_zarr(grid: Literal["hex", "healpix"], level: int):
level (int): The grid level to use. level (int): The grid level to use.
""" """
embs = gpd.read_parquet(DATA_DIR / "embeddings" / f"permafrost_{grid}{level}_embeddings-2017.parquet") cell_ids = grids.get_cell_ids(grid, level)
# ? Converting cell IDs from hex strings to integers for xdggs compatibility
cells = [int(cid, 16) for cid in embs.cell_id.to_list()]
years = list(range(2017, 2025)) years = list(range(2017, 2025))
aggs = ["median", "stdDev", "min", "max", "mean", "p1", "p5", "p25", "p75", "p95", "p99"] aggs = ["median", "stdDev", "min", "max", "mean", "p1", "p5", "p25", "p75", "p95", "p99"]
bands = [f"A{str(i).zfill(2)}" for i in range(64)] bands = [f"A{str(i).zfill(2)}" for i in range(64)]
a = xr.DataArray( a = xr.DataArray(
np.nan, np.nan,
dims=("year", "cell", "band", "agg"), dims=("year", "cell_ids", "band", "agg"),
coords={"year": years, "cell": cells, "band": bands, "agg": aggs}, coords={"year": years, "cell_ids": cell_ids, "band": bands, "agg": aggs},
) ).astype(np.float32)
# ? These attributes are needed for xdggs # ? These attributes are needed for xdggs
a.cell.attrs = { a.cell_ids.attrs = {
"grid_name": "h3" if grid == "hex" else "healpix", "grid_name": "h3" if grid == "hex" else "healpix",
"level": level, "level": level,
} }
if grid == "healpix": if grid == "healpix":
a.cell.attrs["indexing_scheme"] = "nested" a.cell_ids.attrs["indexing_scheme"] = "nested"
for year in track(years, total=len(years), description="Processing years..."): for year in track(years, total=len(years), description="Processing years..."):
embs = gpd.read_parquet(DATA_DIR / "embeddings" / f"permafrost_{grid}{level}_embeddings-{year}.parquet") embeddings_file = get_annual_embeddings_file(grid, level, year)
embs = gpd.read_parquet(embeddings_file)
for band in bands: for band in bands:
for agg in aggs: for agg in aggs:
col = f"{band}_{agg}" col = f"{band}_{agg}"
a.loc[{"band": band, "agg": agg, "year": year}] = embs[col].to_list() a.loc[{"band": band, "agg": agg, "year": year}] = embs[col].to_list()
zarr_path = EMBEDDINGS_DIR / f"permafrost_{grid}{level}_embeddings.zarr" a = xdggs.decode(a)
a.to_zarr(zarr_path, consolidated=False, mode="w")
print(f"Saved combined embeddings to {zarr_path.resolve()}.") zarr_path = get_embeddings_store(grid, level)
a.to_zarr(zarr_path, consolidated=False, mode="w", encoding=codecs.from_ds(a))
print(f"Saved combined embeddings to {zarr_path}.")
def main(): # noqa: D103 def main(): # noqa: D103

31
src/entropice/codecs.py Normal file
View file

@ -0,0 +1,31 @@
"""Encoding utilities for zarr dataset storage."""
import xarray as xr
from zarr.codecs import BloscCodec
def from_ds(ds: xr.Dataset, store_floats_as_float32: bool = True, include_coords: bool = True) -> dict:
    """Create compression encoding for zarr dataset storage.

    Builds a Blosc compression entry (zstd, compression level 5) for every data
    variable — and optionally every coordinate — of the dataset.

    Args:
        ds (xr.Dataset): The xarray Dataset to create encoding for.
        store_floats_as_float32 (bool, optional): Whether to store floating point data as float32.
            Defaults to True.
        include_coords (bool, optional): Whether to include coordinates in the encoding.
            This is useful when appending to an existing store.
            Defaults to True.

    Returns:
        dict: Encoding dictionary with compression settings for each variable.
    """
    names = list(ds.data_vars)
    if include_coords:
        names.extend(ds.coords)
    encoding: dict = {name: {"compressors": BloscCodec(cname="zstd", clevel=5)} for name in names}
    if store_floats_as_float32:
        # Only float64 data variables are downcast; coordinates keep their dtype.
        for name in ds.data_vars:
            if ds[name].dtype == "float64":
                encoding[name]["dtype"] = "float32"
    return encoding

72
src/entropice/darts.py Normal file
View file

@ -0,0 +1,72 @@
"""Labels of Retrogressive-Thaw-Slumps (RTS).
Assumes that the level 1 and level 2 DARTS features have been downloaded into $DATA_DIR / entropyc-rts / darts: https://arcticdata.io/catalog/view/doi:10.18739/A22B8VD7C
Author: Tobias Hölzer
Date: October 2025
"""
from typing import Literal
import cyclopts
import geopandas as gpd
from rich import pretty, print, traceback
from rich.progress import track
from stopuhr import stopwatch
from entropice import grids
from entropice.paths import dartsl2_cov_file, dartsl2_file, get_darts_rts_file
traceback.install()
pretty.install()
def extract_darts_rts(grid: Literal["hex", "healpix"], level: int) -> None:
    """Extract RTS labels from DARTS dataset.

    Intersects the DARTS level-2 RTS features and their coverage footprints with
    the requested grid, then derives per-cell, per-year label columns
    (``darts_{year}_rts_count`` / ``_rts_area`` / ``_covered_area`` /
    ``_coverage`` / ``_rts_density``) and writes the enriched grid to parquet.

    Args:
        grid (Literal["hex", "healpix"]): The grid type to use.
        level (int): The grid level to use.
    """
    with stopwatch("Load data"):
        darts_l2 = gpd.read_parquet(dartsl2_file)
        darts_cov_l2 = gpd.read_parquet(dartsl2_cov_file)
        grid_gdf = grids.open(grid, level)
    with stopwatch("Extract RTS labels"):
        # Clip DARTS features to grid cells; reprojects DARTS into the grid CRS first.
        grid_l2 = grid_gdf.overlay(darts_l2.to_crs(grid_gdf.crs), how="intersection")
        grid_cov_l2 = grid_gdf.overlay(darts_cov_l2.to_crs(grid_gdf.crs), how="intersection")
    # Years are taken from the coverage layer, so cells without coverage in a year stay NaN.
    years = list(grid_cov_l2["year"].unique())
    for year in track(years, total=len(years), description="Processing years..."):
        with stopwatch("Processing RTS", log=False):
            subset = grid_l2[grid_l2["year"] == year]
            subset_cov = grid_cov_l2[grid_cov_l2["year"] == year]
            # Number of RTS fragments intersecting each cell this year.
            counts = subset.groupby("cell_id").size()
            grid_gdf[f"darts_{year}_rts_count"] = grid_gdf.cell_id.map(counts)
            # Summed RTS area per cell. NOTE(review): areas are in CRS units —
            # assumes the grid CRS is projected (metric); confirm for healpix grids.
            areas = subset.groupby("cell_id").apply(lambda x: x.geometry.area.sum(), include_groups=False)
            grid_gdf[f"darts_{year}_rts_area"] = grid_gdf.cell_id.map(areas)
            # Area of the cell actually covered by DARTS observations this year.
            areas_cov = subset_cov.groupby("cell_id").apply(lambda x: x.geometry.area.sum(), include_groups=False)
            grid_gdf[f"darts_{year}_covered_area"] = grid_gdf.cell_id.map(areas_cov)
            # Fraction of the cell observed, and RTS area normalized by observed area.
            grid_gdf[f"darts_{year}_coverage"] = grid_gdf[f"darts_{year}_covered_area"] / grid_gdf.geometry.area
            grid_gdf[f"darts_{year}_rts_density"] = (
                grid_gdf[f"darts_{year}_rts_area"] / grid_gdf[f"darts_{year}_covered_area"]
            )
    output_path = get_darts_rts_file(grid, level)
    grid_gdf.to_parquet(output_path)
    print(f"Saved RTS labels to {output_path}")
    stopwatch.summary()
def main():
    """CLI entry point: dispatch ``extract_darts_rts`` via cyclopts."""
    cyclopts.run(extract_darts_rts)


if __name__ == "__main__":
    main()

View file

@ -1,3 +1,4 @@
# ruff: noqa: PD011
"""Download and preprocess ERA5 data. """Download and preprocess ERA5 data.
Variables of Interest: Variables of Interest:
@ -9,6 +10,12 @@ Variables of Interest:
- Surface sensible heat flux (sshf) [accum] - Surface sensible heat flux (sshf) [accum]
- Lake ice bottom temperature (lblt) [instant] - Lake ice bottom temperature (lblt) [instant]
Snow Fall and Total precipitation are both further accumulated over a day -
thus instead of taking the daily sum, only the last value of the day is taken.
This is 00:00 of the next day, e.g.:
tp_2020_06_23 <- tp_2020_06_24_00:00
See: https://confluence.ecmwf.int/display/CKB/ERA5-Land%3A+data+documentation#ERA5Land:datadocumentation-accumulationsAccumulations
Naming patterns: Naming patterns:
- Instant Variables are downloaded already as statistically aggregated (lossy), - Instant Variables are downloaded already as statistically aggregated (lossy),
therefore their names get the aggregation as suffix therefore their names get the aggregation as suffix
@ -66,14 +73,12 @@ Author: Tobias Hölzer
Date: June to October 2025 Date: June to October 2025
""" """
import os import cProfile
import time import time
from pathlib import Path
from typing import Literal from typing import Literal
import cyclopts import cyclopts
import dask.distributed as dd import dask.distributed as dd
import geopandas as gpd
import numpy as np import numpy as np
import odc.geo import odc.geo
import odc.geo.xr import odc.geo.xr
@ -84,32 +89,16 @@ import xarray as xr
from rich import pretty, print, traceback from rich import pretty, print, traceback
from rich.progress import track from rich.progress import track
from shapely.geometry import LineString, Polygon from shapely.geometry import LineString, Polygon
from zarr.codecs import BloscCodec from stopuhr import stopwatch
traceback.install(show_locals=True, suppress=[cyclopts, xr, pd]) from entropice import codecs, grids, watermask
from entropice.paths import get_era5_stores
traceback.install(show_locals=True, suppress=[cyclopts, xr, pd, cProfile])
pretty.install() pretty.install()
cli = cyclopts.App() cli = cyclopts.App()
DATA_DIR = Path(os.environ.get("DATA_DIR", "data")) / "entropyc-rts"
ERA5_DIR = DATA_DIR / "era5"
DAILY_RAW_PATH = ERA5_DIR / "daily_raw.zarr"
DAILY_ENRICHED_PATH = ERA5_DIR / "daily_enriched.zarr"
MONTHLY_RAW_PATH = ERA5_DIR / "monthly_raw.zarr"
YEARLY_RAW_PATH = ERA5_DIR / "yearly_aligned.zarr"
SUMMER_RAW_PATH = ERA5_DIR / "summer_aligned.zarr"
WINTER_RAW_PATH = ERA5_DIR / "winter_aligned.zarr"
def _get_grid_paths(
agg: Literal["daily", "monthly", "summer", "winter", "yearly"],
grid: Literal["hex", "healpix"],
level: int,
):
gridname = f"permafrost_{grid}{level}"
aligned_path = ERA5_DIR / f"{agg}_{gridname}.zarr"
return aligned_path
min_lat = 50 min_lat = 50
max_lat = 83.7 # Ensures the right Chunks Size (90 - 64 / 10 + 0.1) max_lat = 83.7 # Ensures the right Chunks Size (90 - 64 / 10 + 0.1)
@ -145,24 +134,6 @@ accums = {
} }
def create_encoding(ds: xr.Dataset):
"""Create compression encoding for zarr dataset storage.
Creates Blosc compression configuration for all data variables and coordinates
in the dataset using zstd compression with level 5.
Args:
ds (xr.Dataset): The xarray Dataset to create encoding for.
Returns:
dict: Encoding dictionary with compression settings for each variable.
"""
# encoding = {var: {"compressors": BloscCodec(cname="zlib", clevel=9)} for var in ds.data_vars}
encoding = {var: {"compressors": BloscCodec(cname="zstd", clevel=5)} for var in [*ds.data_vars, *ds.coords]}
return encoding
# ================ # ================
# === Download === # === Download ===
# ================ # ================
@ -208,16 +179,18 @@ def download_daily_aggregated():
daily_raw = xr.merge( daily_raw = xr.merge(
[ [
# Instant # Instant
era5.t2m.resample(time="1D").max().rename("t2m_max"), era5.t2m.resample(time="1D").max().rename("t2m_max").astype(np.float32),
era5.t2m.resample(time="1D").min().rename("t2m_min"), era5.t2m.resample(time="1D").min().rename("t2m_min").astype(np.float32),
era5.t2m.resample(time="1D").mean().rename("t2m_mean"), era5.t2m.resample(time="1D").mean().rename("t2m_mean").astype(np.float32),
era5.snowc.resample(time="1D").mean().rename("snowc_mean"), era5.snowc.resample(time="1D").mean().rename("snowc_mean").astype(np.float32),
era5.sde.resample(time="1D").mean().rename("sde_mean"), era5.sde.resample(time="1D").mean().rename("sde_mean").astype(np.float32),
era5.lblt.resample(time="1D").max().rename("lblt_max"), era5.lblt.resample(time="1D").max().rename("lblt_max").astype(np.float32),
# Accum # Accum
era5.tp.resample(time="1D").sum().rename("tp"), era5.sshf.resample(time="1D").sum(skipna=False).rename("sshf").astype(np.float32),
era5.sf.resample(time="1D").sum().rename("sf"), # Precipitation and snow fall are special
era5.sshf.resample(time="1D").sum().rename("sshf"), # Take only the last value of the day (00:00 of next day)
era5.tp.resample(time="1D").first().shift(time=-1).rename("tp").astype(np.float32),
era5.sf.resample(time="1D").first().shift(time=-1).rename("sf").astype(np.float32),
] ]
) )
@ -225,16 +198,18 @@ def download_daily_aggregated():
daily_raw["t2m_max"].attrs = {"long_name": "Daily maximum 2 metre temperature", "units": "K"} daily_raw["t2m_max"].attrs = {"long_name": "Daily maximum 2 metre temperature", "units": "K"}
daily_raw["t2m_min"].attrs = {"long_name": "Daily minimum 2 metre temperature", "units": "K"} daily_raw["t2m_min"].attrs = {"long_name": "Daily minimum 2 metre temperature", "units": "K"}
daily_raw["t2m_mean"].attrs = {"long_name": "Daily mean 2 metre temperature", "units": "K"} daily_raw["t2m_mean"].attrs = {"long_name": "Daily mean 2 metre temperature", "units": "K"}
daily_raw["tp"].attrs = {"long_name": "Daily total precipitation", "units": "m"}
daily_raw["sf"].attrs = {"long_name": "Daily total snow fall", "units": "m"}
daily_raw["snowc_mean"].attrs = {"long_name": "Daily mean snow cover", "units": "m"} daily_raw["snowc_mean"].attrs = {"long_name": "Daily mean snow cover", "units": "m"}
daily_raw["sde_mean"].attrs = {"long_name": "Daily mean snow depth", "units": "m"} daily_raw["sde_mean"].attrs = {"long_name": "Daily mean snow depth", "units": "m"}
daily_raw["sshf"].attrs = {"long_name": "Daily total surface sensible heat flux", "units": "J/m²"}
daily_raw["lblt_max"].attrs = {"long_name": "Daily maximum lake ice bottom temperature", "units": "K"} daily_raw["lblt_max"].attrs = {"long_name": "Daily maximum lake ice bottom temperature", "units": "K"}
daily_raw["tp"].attrs = {"long_name": "Daily total precipitation", "units": "m"} # Units are rather m^3 / m^2
daily_raw["sf"].attrs = {"long_name": "Daily total snow fall", "units": "m"} # Units are rather m^3 / m^2
daily_raw["sshf"].attrs = {"long_name": "Daily total surface sensible heat flux", "units": "J/m²"}
daily_raw = daily_raw.odc.assign_crs("epsg:4326") daily_raw = daily_raw.odc.assign_crs("epsg:4326")
daily_raw = daily_raw.drop_vars(["surface", "number", "depthBelowLandLayer"]) daily_raw = daily_raw.drop_vars(["surface", "number", "depthBelowLandLayer"])
daily_raw.to_zarr(DAILY_RAW_PATH, mode="w", encoding=create_encoding(daily_raw), consolidated=False) daily_store = get_era5_stores("daily")
print(f"Saving downloaded and daily aggregated ERA5 data to {daily_store}.")
daily_raw.to_zarr(daily_store, mode="w", encoding=codecs.from_ds(daily_raw), consolidated=False)
@cli.command @cli.command
@ -252,7 +227,7 @@ def download():
print(client) print(client)
print(client.dashboard_link) print(client.dashboard_link)
download_daily_aggregated() download_daily_aggregated()
print(f"Downloaded and aggregated ERA5 data to {DAILY_RAW_PATH.resolve()}.") print("Downloaded and aggregated ERA5 data.")
# ============================ # ============================
@ -275,36 +250,62 @@ def daily_enrich():
- Snow isolation index - Snow isolation index
""" """
daily = xr.open_zarr(DAILY_RAW_PATH, consolidated=False).set_coords("spatial_ref") daily_store = get_era5_stores("daily")
daily = xr.open_zarr(daily_store, consolidated=False).set_coords("spatial_ref")
assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}" assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}"
# For better dask performance, all variables are written immediately after calculation
# The smart scheduling which could performantly handle more is according to docs not yet implemented.
# See https://docs.xarray.dev/en/stable/user-guide/dask.html#best-practices point 3 or https://github.com/dask/dask/issues/874
def _store(v: str):
nonlocal daily
encoding = codecs.from_ds(daily[[v]], include_coords=False)
print(f"Storing enriched daily variable {v} to {daily_store}...")
with stopwatch("Storing enriched daily variable"):
daily[[v]].to_zarr(daily_store, mode="a", encoding=encoding, consolidated=False)
daily = xr.open_zarr(daily_store, consolidated=False).set_coords("spatial_ref")
# Formulas based on Groeke et. al. (2025) Stochastic Weather generation... # Formulas based on Groeke et. al. (2025) Stochastic Weather generation...
daily["t2m_avg"] = (daily.t2m_max + daily.t2m_min) / 2 daily["t2m_avg"] = (daily.t2m_max + daily.t2m_min) / 2
daily.t2m_avg.attrs = {"long_name": "Daily average 2 metre temperature", "units": "K"} daily.t2m_avg.attrs = {"long_name": "Daily average 2 metre temperature", "units": "K"}
_store("t2m_avg")
daily["t2m_range"] = daily.t2m_max - daily.t2m_min daily["t2m_range"] = daily.t2m_max - daily.t2m_min
daily.t2m_range.attrs = {"long_name": "Daily range of 2 metre temperature", "units": "K"} daily.t2m_range.attrs = {"long_name": "Daily range of 2 metre temperature", "units": "K"}
_store("t2m_range")
daily["t2m_skew"] = (daily.t2m_mean - daily.t2m_min) / daily.t2m_range daily["t2m_skew"] = (daily.t2m_mean - daily.t2m_min) / daily.t2m_range
daily.t2m_skew.attrs = {"long_name": "Daily skewness of 2 metre temperature"} daily.t2m_skew.attrs = {"long_name": "Daily skewness of 2 metre temperature"}
_store("t2m_skew")
daily["thawing_degree_days"] = (daily.t2m_avg - 273.15).clip(min=0) daily["thawing_degree_days"] = (daily.t2m_avg - 273.15).clip(min=0)
daily.thawing_degree_days.attrs = {"long_name": "Thawing degree days", "units": "K"} daily.thawing_degree_days.attrs = {"long_name": "Thawing degree days", "units": "K"}
_store("thawing_degree_days")
daily["freezing_degree_days"] = (273.15 - daily.t2m_avg).clip(min=0) daily["freezing_degree_days"] = (273.15 - daily.t2m_avg).clip(min=0)
daily.freezing_degree_days.attrs = {"long_name": "Freezing degree days", "units": "K"} daily.freezing_degree_days.attrs = {"long_name": "Freezing degree days", "units": "K"}
_store("freezing_degree_days")
daily["thawing_days"] = (daily.t2m_avg > 273.15).astype(int) daily["thawing_days"] = (daily.t2m_avg > 273.15).astype(int)
daily.thawing_days.attrs = {"long_name": "Thawing days"} daily.thawing_days.attrs = {"long_name": "Thawing days"}
_store("thawing_days")
daily["freezing_days"] = (daily.t2m_avg < 273.15).astype(int) daily["freezing_days"] = (daily.t2m_avg < 273.15).astype(int)
daily.freezing_days.attrs = {"long_name": "Freezing days"} daily.freezing_days.attrs = {"long_name": "Freezing days"}
_store("freezing_days")
daily["precipitation_occurrences"] = (daily.tp > 0).astype(int) daily["precipitation_occurrences"] = (daily.tp > 0.001).astype(int)
daily.precipitation_occurrences.attrs = {"long_name": "Precipitation occurrences"} daily.precipitation_occurrences.attrs = {"long_name": "Precipitation occurrences"}
daily["snowfall_occurrences"] = (daily.sf > 0).astype(int) _store("precipitation_occurrences")
daily["snowfall_occurrences"] = (daily.sf > 0.001).astype(int)
daily.snowfall_occurrences.attrs = {"long_name": "Snowfall occurrences"} daily.snowfall_occurrences.attrs = {"long_name": "Snowfall occurrences"}
_store("snowfall_occurrences")
daily["naive_snow_isolation"] = daily.snowc_mean * daily.sde_mean daily["naive_snow_isolation"] = daily.snowc_mean * daily.sde_mean
daily.naive_snow_isolation.attrs = {"long_name": "Naive snow isolation"} daily.naive_snow_isolation.attrs = {"long_name": "Naive snow isolation"}
_store("naive_snow_isolation")
daily.to_zarr(DAILY_ENRICHED_PATH, mode="w", encoding=create_encoding(daily), consolidated=False)
def monthly_aggregate(): def monthly_aggregate():
@ -318,7 +319,8 @@ def monthly_aggregate():
The aggregated monthly data is saved to a zarr file for further processing. The aggregated monthly data is saved to a zarr file for further processing.
""" """
daily = xr.open_zarr(DAILY_ENRICHED_PATH, consolidated=False) daily_store = get_era5_stores("daily")
daily = xr.open_zarr(daily_store, consolidated=False).set_coords("spatial_ref")
assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}" assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}"
daily = daily.sel(time=slice(min_time, max_time)) daily = daily.sel(time=slice(min_time, max_time))
@ -348,8 +350,10 @@ def monthly_aggregate():
monthly_accums.append(agg) monthly_accums.append(agg)
monthly = xr.merge(monthly_instants + monthly_accums) monthly = xr.merge(monthly_instants + monthly_accums)
monthly = monthly.chunk({"time": len(monthly.time), "latitude": 64, "longitude": 64}) monthly = monthly.chunk({"time": len(monthly.time), "latitude": 256, "longitude": 256}) # ~ 100Mb / chunk for f32
monthly.to_zarr(MONTHLY_RAW_PATH, mode="w", encoding=create_encoding(monthly), consolidated=False) monthly_store = get_era5_stores("monthly")
print(f"Saving monthly aggregated ERA5 data to {monthly_store}.")
monthly.to_zarr(monthly_store, mode="w", encoding=codecs.from_ds(monthly), consolidated=False)
def multi_monthly_aggregate(monthly: xr.Dataset, n: int = 12) -> xr.Dataset: def multi_monthly_aggregate(monthly: xr.Dataset, n: int = 12) -> xr.Dataset:
@ -407,47 +411,12 @@ def multi_monthly_aggregate(monthly: xr.Dataset, n: int = 12) -> xr.Dataset:
"link": "https://tc.copernicus.org/articles/11/989/2017/tc-11-989-2017.pdf", "link": "https://tc.copernicus.org/articles/11/989/2017/tc-11-989-2017.pdf",
} }
multimonthly = multimonthly.chunk({"time": len(multimonthly.time), "latitude": 64, "longitude": 64}) multimonthly = multimonthly.chunk(
{"time": len(multimonthly.time), "latitude": 128, "longitude": 1024}
) # ~36Mb / chunk for f64
return multimonthly return multimonthly
def yearly_and_seasonal_aggregate():
"""Aggregate monthly ERA5 data to yearly resolution with seasonal splits.
Takes monthly aggregated data and creates yearly aggregates using a shifted
calendar (October to September) to better capture Arctic seasonal patterns.
Creates separate aggregates for full year, winter (Oct-Apr), and summer
(May-Sep) periods.
The first and last incomplete years are excluded from the analysis.
Winter months are defined as months 1-7 in the shifted calendar,
and summer months are 8-12.
The final dataset includes yearly, winter, and summer aggregates for all
climate variables, saved to a zarr file.
"""
monthly = xr.open_zarr(MONTHLY_RAW_PATH, consolidated=False).set_coords("spatial_ref")
assert "time" in monthly.dims, f"Expected dim 'time' to be in {monthly.dims=}"
# "Shift" the calendar by slicing the first Jan-Sep and the last Oct-Dec months
first_year = monthly.time.dt.year.min().item()
last_year = monthly.time.dt.year.max().item()
monthly = monthly.sel(time=slice(f"{first_year}-10-01", f"{last_year}-09-30"))
yearly = multi_monthly_aggregate(monthly, n=12)
yearly = derive_yearly_variables(yearly)
yearly.to_zarr(YEARLY_RAW_PATH, mode="w", encoding=create_encoding(yearly), consolidated=False)
summer_winter = multi_monthly_aggregate(monthly, n=6)
summer = summer_winter.sel(time=summer_winter.time.dt.month == 4)
summer.to_zarr(SUMMER_RAW_PATH, mode="w", encoding=create_encoding(summer), consolidated=False)
winter = summer_winter.sel(time=summer_winter.time.dt.month == 10)
winter.to_zarr(WINTER_RAW_PATH, mode="w", encoding=create_encoding(winter), consolidated=False)
def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset: def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
"""Derive additional variables from daily data and add them to the yearly dataset. """Derive additional variables from daily data and add them to the yearly dataset.
@ -459,7 +428,8 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
""" """
assert "time" in yearly.dims, f"Expected dim 'time' to be in {yearly.dims=}" assert "time" in yearly.dims, f"Expected dim 'time' to be in {yearly.dims=}"
daily = xr.open_zarr(DAILY_ENRICHED_PATH, consolidated=False).set_coords("spatial_ref") daily_store = get_era5_stores("daily")
daily = xr.open_zarr(daily_store, consolidated=False).set_coords("spatial_ref")
assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}" assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}"
daily = daily.sel(time=slice(min_time, max_time)) daily = daily.sel(time=slice(min_time, max_time))
# ? Note: The functions do not really account for leap years # ? Note: The functions do not really account for leap years
@ -476,7 +446,7 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
# This assumes that the 01-01 is almost everywhere one of the coldest days in the year # This assumes that the 01-01 is almost everywhere one of the coldest days in the year
first_thaw_day = daily.thawing_days.groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1 first_thaw_day = daily.thawing_days.groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1
first_thaw_day = first_thaw_day.where(~never_thaws).rename("day_of_first_thaw").rename(year="time") first_thaw_day = first_thaw_day.where(~never_thaws).rename("day_of_first_thaw").rename(year="time")
first_thaw_day["time"] = pd.to_datetime([f"{y}-10-01" for y in first_thaw_day.time.values]) # noqa: PD011 first_thaw_day["time"] = pd.to_datetime([f"{y}-10-01" for y in first_thaw_day.time.values])
first_thaw_day.attrs = {"long_name": "Day of first thaw in year", "units": "day of year"} first_thaw_day.attrs = {"long_name": "Day of first thaw in year", "units": "day of year"}
yearly["day_of_first_thaw"] = first_thaw_day.sel(time=yearly.time) yearly["day_of_first_thaw"] = first_thaw_day.sel(time=yearly.time)
@ -484,7 +454,7 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
n_days_in_year - daily.thawing_days[::-1].groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1 n_days_in_year - daily.thawing_days[::-1].groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1
) )
last_thaw_day = last_thaw_day.where(~never_thaws).rename("day_of_last_thaw").rename(year="time") last_thaw_day = last_thaw_day.where(~never_thaws).rename("day_of_last_thaw").rename(year="time")
last_thaw_day["time"] = pd.to_datetime([f"{y}-10-01" for y in last_thaw_day.time.values]) # noqa: PD011 last_thaw_day["time"] = pd.to_datetime([f"{y}-10-01" for y in last_thaw_day.time.values])
last_thaw_day.attrs = {"long_name": "Day of last thaw in year", "units": "day of year"} last_thaw_day.attrs = {"long_name": "Day of last thaw in year", "units": "day of year"}
yearly["day_of_last_thaw"] = last_thaw_day.sel(time=yearly.time) yearly["day_of_last_thaw"] = last_thaw_day.sel(time=yearly.time)
@ -506,7 +476,7 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
first_freezing_day = daily_shifted.freezing_days.groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1 first_freezing_day = daily_shifted.freezing_days.groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1
first_freezing_day = first_freezing_day.where(~never_freezes).rename("day_of_first_freeze").rename(year="time") first_freezing_day = first_freezing_day.where(~never_freezes).rename("day_of_first_freeze").rename(year="time")
first_freezing_day["time"] = pd.to_datetime([f"{y}-10-01" for y in first_freezing_day.time.values]) # noqa: PD011 first_freezing_day["time"] = pd.to_datetime([f"{y}-10-01" for y in first_freezing_day.time.values])
first_freezing_day.attrs = {"long_name": "Day of first freeze in year", "units": "day of year"} first_freezing_day.attrs = {"long_name": "Day of first freeze in year", "units": "day of year"}
yearly["day_of_first_freeze"] = first_freezing_day.sel(time=yearly.time) yearly["day_of_first_freeze"] = first_freezing_day.sel(time=yearly.time)
@ -516,7 +486,7 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
+ 1 + 1
) )
last_freezing_day = last_freezing_day.where(~never_freezes).rename("day_of_last_freeze").rename(year="time") last_freezing_day = last_freezing_day.where(~never_freezes).rename("day_of_last_freeze").rename(year="time")
last_freezing_day["time"] = pd.to_datetime([f"{y}-10-01" for y in last_freezing_day.time.values]) # noqa: PD011 last_freezing_day["time"] = pd.to_datetime([f"{y}-10-01" for y in last_freezing_day.time.values])
last_freezing_day.attrs = {"long_name": "Day of last freeze in year", "units": "day of year"} last_freezing_day.attrs = {"long_name": "Day of last freeze in year", "units": "day of year"}
yearly["day_of_last_freeze"] = last_freezing_day.sel(time=yearly.time) yearly["day_of_last_freeze"] = last_freezing_day.sel(time=yearly.time)
@ -528,6 +498,53 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
return yearly return yearly
def yearly_and_seasonal_aggregate():
"""Aggregate monthly ERA5 data to yearly resolution with seasonal splits.
Takes monthly aggregated data and creates yearly aggregates using a shifted
calendar (October to September) to better capture Arctic seasonal patterns.
Creates separate aggregates for full year, winter (Oct-Apr), and summer
(May-Sep) periods.
The first and last incomplete years are excluded from the analysis.
Winter months are defined as months 1-7 in the shifted calendar,
and summer months are 8-12.
The final dataset includes yearly, winter, and summer aggregates for all
climate variables, saved to a zarr file.
"""
monthly_store = get_era5_stores("monthly")
monthly = xr.open_zarr(monthly_store, consolidated=False).set_coords("spatial_ref")
assert "time" in monthly.dims, f"Expected dim 'time' to be in {monthly.dims=}"
# "Shift" the calendar by slicing the first Jan-Sep and the last Oct-Dec months
first_year = monthly.time.dt.year.min().item()
last_year = monthly.time.dt.year.max().item()
monthly = monthly.sel(time=slice(f"{first_year}-10-01", f"{last_year}-09-30"))
yearly = multi_monthly_aggregate(monthly, n=12)
yearly = derive_yearly_variables(yearly)
yearly = yearly.chunk({"time": len(yearly.time), "latitude": 256, "longitude": 1024}) # ~36Mb / chunk for f32
yearly_store = get_era5_stores("yearly")
print(f"Saving yearly aggregated ERA5 data to {yearly_store}.")
yearly.to_zarr(yearly_store, mode="w", encoding=codecs.from_ds(yearly), consolidated=False)
summer_winter = multi_monthly_aggregate(monthly, n=6)
summer = summer_winter.sel(time=summer_winter.time.dt.month == 4)
summer = summer.chunk({"time": len(summer.time), "latitude": 256, "longitude": 1024}) # ~36Mb / chunk for f32
summer_store = get_era5_stores("summer")
print(f"Saving summer aggregated ERA5 data to {summer_store}.")
summer.to_zarr(summer_store, mode="w", encoding=codecs.from_ds(summer), consolidated=False)
winter = summer_winter.sel(time=summer_winter.time.dt.month == 10)
winter = winter.chunk({"time": len(winter.time), "latitude": 256, "longitude": 1024}) # ~36Mb / chunk for f32
winter_store = get_era5_stores("winter")
print(f"Saving winter aggregated ERA5 data to {winter_store}.")
winter.to_zarr(winter_store, mode="w", encoding=codecs.from_ds(winter), consolidated=False)
@cli.command @cli.command
def enrich(n_workers: int = 10, monthly: bool = True, yearly: bool = True, daily: bool = True): def enrich(n_workers: int = 10, monthly: bool = True, yearly: bool = True, daily: bool = True):
"""Enrich data and pPerform temporal aggregation of ERA5 data using Dask cluster. """Enrich data and pPerform temporal aggregation of ERA5 data using Dask cluster.
@ -591,15 +608,16 @@ def _check_geom(geobox: odc.geo.geobox.GeoBox, geom: odc.geo.Geometry) -> bool:
return (roix.stop - roix.start) > 1 and (roiy.stop - roiy.start) > 1 return (roix.stop - roix.start) > 1 and (roiy.stop - roiy.start) > 1
def extract_cell_data(yearly: xr.Dataset, geom: Polygon): @stopwatch("Getting corrected geometries", log=False)
"""Extract ERA5 data for a specific grid cell geometry. def _get_corrected_geoms(geom: Polygon, gbox: odc.geo.geobox.GeoBox) -> list[odc.geo.Geometry]:
"""Get corrected geometries for antimeridian-crossing polygons.
Extracts and spatially averages ERA5 data within the bounds of a grid cell.
Handles antimeridian-crossing cells by splitting them appropriately.
Args: Args:
yearly (xr.Dataset): Yearly aggregated ERA5 dataset. geom (Polygon): Input polygon geometry.
geom (Polygon): Polygon geometry of the grid cell. gbox (odc.geo.geobox.GeoBox): GeoBox for spatial reference.
Returns:
list[odc.geo.Geometry]: List of corrected, georeferenced geometries.
""" """
# cell.geometry is a shapely Polygon # cell.geometry is a shapely Polygon
@ -608,28 +626,73 @@ def extract_cell_data(yearly: xr.Dataset, geom: Polygon):
# Split geometry in case it crossed antimeridian # Split geometry in case it crossed antimeridian
else: else:
geoms = _split_antimeridian_cell(geom) geoms = _split_antimeridian_cell(geom)
cell_data = []
for geom in geoms: geoms = [odc.geo.Geometry(g, crs="epsg:4326") for g in geoms]
geom = odc.geo.Geometry(geom, crs="epsg:4326") geoms = filter(lambda g: _check_geom(gbox, g), geoms)
if not _check_geom(yearly.odc.geobox, geom): return geoms
continue
cell_data.append(yearly.odc.crop(geom).drop_vars("spatial_ref").mean(["latitude", "longitude"]))
if len(cell_data) == 0: @stopwatch("Extracting cell data", log=False)
def extract_cell_data(ds: xr.Dataset, geoms: list[odc.geo.Geometry]) -> xr.Dataset | bool:
"""Extract ERA5 data for a specific grid cell geometry.
Extracts and spatially averages ERA5 data within the bounds of a grid cell.
Handles antimeridian-crossing cells by splitting them appropriately.
Args:
ds (xr.Dataset): An ERA5 dataset.
geoms (list[odc.geo.Geometry]): List of (valid) geometries of the grid cell.
"""
if len(geoms) == 0:
return False return False
elif len(cell_data) == 1: elif len(geoms) == 1:
cell_data = cell_data[0] return ds.odc.crop(geoms[0]).drop_vars("spatial_ref").mean(["latitude", "longitude"], skipna=True).compute()
else: else:
cell_data = xr.concat(cell_data, dim="part").mean("part") parts = [
cell_data = cell_data.compute() ds.odc.crop(geom).drop_vars("spatial_ref").mean(["latitude", "longitude"], skipna=True) for geom in geoms
return cell_data ]
parts = [part for part in parts if part.latitude.size > 0 and part.longitude.size > 0]
if len(parts) == 0:
raise ValueError("No valid parts found for geometry. This should not happen!")
elif len(parts) == 1:
return parts[0].compute()
else:
return xr.concat(parts, dim="part").mean("part", skipna=True).compute()
def _correct_longs(ds: xr.Dataset) -> xr.Dataset:
return ds.assign_coords(longitude=(((ds.longitude + 180) % 360) - 180)).sortby("longitude")
@stopwatch("Creating aligned dataset", log=False)
def _create_aligned(
ds: xr.Dataset, data: dict[str, np.ndarray], grid: Literal["hex", "healpix"], level: int
) -> xr.Dataset:
cell_ids = grids.get_cell_ids(grid, level)
data_vars = {var: (["cell_ids", "time"], values) for var, values in data.items()}
aligned = xr.Dataset(
data_vars,
coords={"cell_ids": cell_ids, "time": ds.time},
)
gridinfo = {
"grid_name": "h3" if grid == "hex" else grid,
"level": level,
}
if grid == "healpix":
gridinfo["indexing_scheme"] = "nested"
aligned.cell_ids.attrs = gridinfo
for var in ds.data_vars:
aligned[var].attrs = ds[var].attrs
aligned = aligned.chunk({"cell_ids": min(len(aligned.cell_ids), 10000), "time": len(aligned.time)})
return aligned
@cli.command @cli.command
def spatial_agg( def spatial_agg(
grid: Literal["hex", "healpix"], grid: Literal["hex", "healpix"],
level: int, level: int,
agg: Literal["summer", "winter", "yearly"] = "yearly",
n_workers: int = 10,
): ):
"""Perform spatial aggregation of ERA5 data to grid cells. """Perform spatial aggregation of ERA5 data to grid cells.
@ -640,69 +703,73 @@ def spatial_agg(
Args: Args:
grid ("hex" | "healpix"): Grid type. grid ("hex" | "healpix"): Grid type.
level (int): Grid resolution level. level (int): Grid resolution level.
agg ("summer" | "winter" | "yearly"): Type of aggregation to perform. Defaults to yearly.
n_workers (int, optional): Number of parallel workers to use. Defaults to 10.
""" """
gridname = f"permafrost_{grid}{level}" grid_gdf = grids.open(grid, level)
agg_grid_path = _get_grid_paths(agg, grid, level) # ? Mask out water, since we don't want to aggregate over oceans
grid_df = gpd.read_parquet(DATA_DIR / f"grids/{gridname}_grid.parquet") grid_gdf = watermask.clip_grid(grid_gdf)
# Create an empty zarr array with the right dimensions
if agg == "summer":
agg_data_path = SUMMER_RAW_PATH
elif agg == "winter":
agg_data_path = WINTER_RAW_PATH
elif agg == "yearly":
agg_data_path = YEARLY_RAW_PATH
else:
raise ValueError(f"Unknown aggregation type: {agg}")
agg_raw = (
xr.open_zarr(agg_data_path, consolidated=False, decode_timedelta=False)
.set_coords("spatial_ref")
.drop_vars(["surface", "number", "depthBelowLandLayer"])
.load()
)
assert {"latitude", "longitude", "time"} == set(agg_raw.dims), (
f"Expected dims ('latitude', 'longitude', 'time'), got {agg_raw.dims}"
)
assert agg_raw.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {agg_raw.odc.crs}"
# Convert lons to -180 to 180 instead of 0 to 360 summer_unaligned_store = get_era5_stores("summer")
agg_raw = agg_raw.assign_coords(longitude=(((agg_raw.longitude + 180) % 360) - 180)).sortby("longitude") winter_unaligned_store = get_era5_stores("winter")
yearly_unaligned_store = get_era5_stores("yearly")
with stopwatch("Loading summer ERA5 data"):
summer_unaligned = xr.open_zarr(summer_unaligned_store, consolidated=False).set_coords("spatial_ref")
assert {"latitude", "longitude", "time"} == set(summer_unaligned.dims)
assert summer_unaligned.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {summer_unaligned.odc.crs}"
summer_unaligned = _correct_longs(summer_unaligned)
with stopwatch("Loading winter ERA5 data"):
winter_unaligned = xr.open_zarr(winter_unaligned_store, consolidated=False).set_coords("spatial_ref")
assert {"latitude", "longitude", "time"} == set(winter_unaligned.dims)
assert winter_unaligned.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {winter_unaligned.odc.crs}"
winter_unaligned = _correct_longs(winter_unaligned)
with stopwatch("Loading yearly ERA5 data"):
yearly_unaligned = xr.open_zarr(yearly_unaligned_store, consolidated=False).set_coords("spatial_ref")
assert {"latitude", "longitude", "time"} == set(yearly_unaligned.dims)
assert yearly_unaligned.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {yearly_unaligned.odc.crs}"
yearly_unaligned = _correct_longs(yearly_unaligned)
# ? Converting cell IDs from hex strings to integers for xdggs compatibility summer_data = {
cells = [int(cid, 16) for cid in grid_df.cell_id.to_list()] var: np.full((len(grid_gdf), len(summer_unaligned.time)), np.nan, dtype=np.float32)
for var in summer_unaligned.data_vars
agg_aligned = (
xr.zeros_like(agg_raw.isel(latitude=0, longitude=0).drop_vars(["latitude", "longitude"]))
.expand_dims({"cell_ids": cells})
.chunk({"cell_ids": min(len(grid_df), 10000), "time": len(agg_raw.time)})
)
agg_aligned.cell_ids.attrs = {
"grid_name": "h3" if grid == "hex" else grid,
"level": level,
} }
if grid == "healpix": winter_data = {
agg_aligned.cell_ids.attrs["indexing_scheme"] = "nested" var: np.full((len(grid_gdf), len(winter_unaligned.time)), np.nan, dtype=np.float32)
for var in winter_unaligned.data_vars
from stopuhr import stopwatch }
yearly_data = {
for _, row in track( var: np.full((len(grid_gdf), len(yearly_unaligned.time)), np.nan, dtype=np.float32)
grid_df.to_crs("epsg:4326").iterrows(), for var in yearly_unaligned.data_vars
total=len(grid_df), }
for i, (_, row) in track(
enumerate(grid_gdf.to_crs("epsg:4326").iterrows()),
total=len(grid_gdf),
description="Spatially aggregating ERA5 data...", description="Spatially aggregating ERA5 data...",
): ):
cell_id = int(row.cell_id, 16) geoms = _get_corrected_geoms(row.geometry, summer_unaligned.odc.geobox)
with stopwatch("Extracting cell data", log=False): if len(geoms) == 0:
cell_data = extract_cell_data(agg_raw, row.geometry) print(f"Warning: No valid geometry for cell {row.cell_id}.")
if cell_data is False:
print(f"Warning: No data found for cell {cell_id}, skipping.")
continue continue
with stopwatch("Assigning cell data", log=False):
agg_aligned.loc[{"cell_ids": cell_id}] = cell_data
agg_aligned.to_zarr(agg_grid_path, mode="w", consolidated=False, encoding=create_encoding(agg_aligned)) cell_data = extract_cell_data(summer_unaligned, geoms)
for var in summer_unaligned.data_vars:
summer_data[var][i, :] = cell_data[var].values.astype()
cell_data = extract_cell_data(winter_unaligned, geoms)
for var in winter_unaligned.data_vars:
winter_data[var][i, :] = cell_data[var].values
cell_data = extract_cell_data(yearly_unaligned, geoms)
for var in yearly_unaligned.data_vars:
yearly_data[var][i, :] = cell_data[var].values
summer = _create_aligned(summer_unaligned, summer_data, grid, level)
winter = _create_aligned(winter_unaligned, winter_data, grid, level)
yearly = _create_aligned(yearly_unaligned, yearly_data, grid, level)
summer_store = get_era5_stores("summer", grid, level)
winter_store = get_era5_stores("winter", grid, level)
yearly_store = get_era5_stores("yearly", grid, level)
summer.to_zarr(summer_store, mode="w", consolidated=False, encoding=codecs.from_ds(summer))
winter.to_zarr(winter_store, mode="w", consolidated=False, encoding=codecs.from_ds(winter))
yearly.to_zarr(yearly_store, mode="w", consolidated=False, encoding=codecs.from_ds(yearly))
print("Finished spatial matching.") print("Finished spatial matching.")
stopwatch.summary() stopwatch.summary()

View file

@ -4,8 +4,6 @@ Author: Tobias Hölzer
Date: 09. June 2025 Date: 09. June 2025
""" """
import os
from pathlib import Path
from typing import Literal from typing import Literal
import cartopy.crs as ccrs import cartopy.crs as ccrs
@ -25,14 +23,45 @@ from shapely.ops import transform
from stopuhr import stopwatch from stopuhr import stopwatch
from xdggs.healpix import HealpixInfo from xdggs.healpix import HealpixInfo
from entropice.paths import get_grid_file, get_grid_viz_file, watermask_file
traceback.install() traceback.install()
pretty.install() pretty.install()
DATA_DIR = Path(os.environ.get("DATA_DIR", "../../data")) / "entropyc-rts"
GRIDS_DIR = DATA_DIR / "grids" def open(grid: Literal["hex", "healpix"], level: int):
FIGURES_DIR = DATA_DIR / "figures" """Open a saved grid from parquet file.
GRIDS_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True) Args:
grid (Literal["hex", "healpix"]): The grid type to use.
level (int): The grid level to use.
Returns:
GeoDataFrame: The loaded grid.
"""
gridfile = get_grid_file(grid, level)
grid = gpd.read_parquet(gridfile)
return grid
def get_cell_ids(grid: Literal["hex", "healpix"], level: int):
"""Get the cell IDs of a saved grid.
Args:
grid (Literal["hex", "healpix"]): The grid type to use.
level (int): The grid level to use.
Returns:
List[str]: The list of cell IDs.
"""
grid_gdf = open(grid, level)
cell_ids = grid_gdf["cell_id"].tolist()
if grid == "hex":
# ? Converting cell IDs from hex strings to integers for xdggs compatibility
cell_ids = [int(cid, 16) for cid in cell_ids]
return cell_ids
@stopwatch("Create a global hex grid") @stopwatch("Create a global hex grid")
@ -131,7 +160,7 @@ def filter_permafrost_grid(grid: gpd.GeoDataFrame):
grid = grid.to_crs("EPSG:3413") grid = grid.to_crs("EPSG:3413")
# Filter out non-land areas (e.g., oceans) # Filter out non-land areas (e.g., oceans)
water_mask = gpd.read_file(DATA_DIR / "simplified-water-polygons-split-3857/simplified_water_polygons.shp") water_mask = gpd.read_file(watermask_file)
water_mask = water_mask.to_crs("EPSG:3413") water_mask = water_mask.to_crs("EPSG:3413")
ov = gpd.overlay(grid, water_mask, how="intersection") ov = gpd.overlay(grid, water_mask, how="intersection")
@ -226,14 +255,14 @@ def cli(grid: Literal["hex", "healpix"], level: int):
print("No valid grid cells found.") print("No valid grid cells found.")
return return
grid_file = GRIDS_DIR / f"permafrost_{grid}{level}_grid.parquet" grid_file = get_grid_file(grid, level)
grid_gdf.to_parquet(grid_file) grid_gdf.to_parquet(grid_file)
print(f"Saved to {grid_file.resolve()}") print(f"Saved to {grid_file}")
fig = vizualize_grid(grid_gdf, grid, level) fig = vizualize_grid(grid_gdf, grid, level)
fig_file = FIGURES_DIR / f"permafrost_{grid}{level}_grid.png" fig_file = get_grid_viz_file(grid, level)
fig.savefig(fig_file, dpi=300) fig.savefig(fig_file, dpi=300)
print(f"Saved figure to {fig_file.resolve()}") print(f"Saved figure to {fig_file}")
plt.close(fig) plt.close(fig)

75
src/entropice/paths.py Normal file
View file

@ -0,0 +1,75 @@
# ruff: noqa: D103
"""Paths for entropice data storage."""
import os
from pathlib import Path
from typing import Literal
DATA_DIR = Path(os.environ.get("DATA_DIR", "data")).resolve() / "entropice"
GRIDS_DIR = DATA_DIR / "grids"
FIGURES_DIR = DATA_DIR / "figures"
DARTS_DIR = DATA_DIR / "darts"
ERA5_DIR = DATA_DIR / "era5"
EMBEDDINGS_DIR = DATA_DIR / "embeddings"
WATERMASK_DIR = DATA_DIR / "watermask"
GRIDS_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
DARTS_DIR.mkdir(parents=True, exist_ok=True)
ERA5_DIR.mkdir(parents=True, exist_ok=True)
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
WATERMASK_DIR.mkdir(parents=True, exist_ok=True)
watermask_file = DATA_DIR / "simplified_water_polygons.shp"
dartsl2_file = DARTS_DIR / "DARTS_NitzeEtAl_v1-2_features_2018-2023_level2.parquet"
dartsl2_cov_file = DARTS_DIR / "DARTS_NitzeEtAl_v1-2_coverage_2018-2023_level2.parquet"
def _get_gridname(grid: Literal["hex", "healpix"], level: int) -> str:
return f"permafrost_{grid}{level}"
def get_grid_file(grid: Literal["hex", "healpix"], level: int) -> Path:
gridname = _get_gridname(grid, level)
gridfile = GRIDS_DIR / f"{gridname}_grid.parquet"
return gridfile
def get_grid_viz_file(grid: Literal["hex", "healpix"], level: int) -> Path:
gridname = _get_gridname(grid, level)
vizfile = FIGURES_DIR / f"{gridname}_grid.png"
return vizfile
def get_darts_rts_file(grid: Literal["hex", "healpix"], level: int) -> Path:
gridname = _get_gridname(grid, level)
rtsfile = DARTS_DIR / f"{gridname}_rts.parquet"
return rtsfile
def get_annual_embeddings_file(grid: Literal["hex", "healpix"], level: int, year: int) -> Path:
gridname = _get_gridname(grid, level)
embfile = EMBEDDINGS_DIR / f"{gridname}_embeddings-{year}.parquet"
return embfile
def get_embeddings_store(grid: Literal["hex", "healpix"], level: int) -> Path:
gridname = _get_gridname(grid, level)
embstore = EMBEDDINGS_DIR / f"{gridname}_embeddings.zarr"
return embstore
def get_era5_stores(
agg: Literal["daily", "monthly", "summer", "winter", "yearly"],
grid: Literal["hex", "healpix"] | None = None,
level: int | None = None,
):
if grid is None or level is None:
return ERA5_DIR / f"{agg}_climate_unaligned.zarr"
gridname = _get_gridname(grid, level)
aligned_path = ERA5_DIR / f"{gridname}_{agg}_climate.zarr"
return aligned_path

View file

@ -0,0 +1,32 @@
"""Helpers for the watermask."""
import geopandas as gpd
from entropice.paths import watermask_file
def open():
"""Open the watermask shapefile.
Returns:
GeoDataFrame: The watermask polygons.
"""
watermask = gpd.read_file(watermask_file)
return watermask
def clip_grid(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
"""Clip the input GeoDataFrame with the watermask.
Args:
gdf (gpd.GeoDataFrame): The input GeoDataFrame to clip.
Returns:
gpd.GeoDataFrame: The clipped GeoDataFrame.
"""
watermask = open()
watermask = watermask.to_crs("EPSG:3413")
gdf = gdf.overlay(watermask, how="difference")
return gdf

View file

@ -1,66 +0,0 @@
"""Labels of Retrogressive-Thaw-Slumps (RTS).
Assumes that the level 1 and level 2 DARTS features have been downloaded into $DATA_DIR / entropyc-rts / darts: https://arcticdata.io/catalog/view/doi:10.18739/A22B8VD7C
Author: Tobias Hölzer
Date: October 2025
"""
import os
from pathlib import Path
from typing import Literal
import cyclopts
import geopandas as gpd
from rich.progress import track
DATA_DIR = Path(os.environ.get("DATA_DIR", "../../data")) / "entropyc-rts"
LEVEL2_PATH = DATA_DIR / "darts" / "DARTS_NitzeEtAl_v1-2_features_2018-2023_level2.parquet"
LEVEL2_COV_PATH = DATA_DIR / "darts" / "DARTS_NitzeEtAl_v1-2_coverage_2018-2023_level2.parquet"
def extract_darts_rts(grid: Literal["hex", "healpix"], level: int):
"""Extract RTS labels from DARTS dataset.
Args:
grid (Literal["hex", "healpix"]): The grid type to use.
level (int): The grid level to use.
"""
darts_l2 = gpd.read_parquet(LEVEL2_PATH)
darts_cov_l2 = gpd.read_parquet(LEVEL2_COV_PATH)
gridname = f"permafrost_{grid}{level}"
grid = gpd.read_parquet(DATA_DIR / f"grids/{gridname}_grid.parquet")
grid_l2 = grid.overlay(darts_l2.to_crs(grid.crs), how="intersection")
grid_cov_l2 = grid.overlay(darts_cov_l2.to_crs(grid.crs), how="intersection")
years = list(grid_cov_l2["year"].unique())
for year in track(years, total=len(years), description="Processing years..."):
subset = grid_l2[grid_l2["year"] == year]
subset_cov = grid_cov_l2[grid_cov_l2["year"] == year]
counts = subset.groupby("cell_id").size()
grid[f"darts_{year}_rts_count"] = grid.cell_id.map(counts)
areas = subset.groupby("cell_id").apply(lambda x: x.geometry.area.sum(), include_groups=False)
grid[f"darts_{year}_rts_area"] = grid.cell_id.map(areas)
areas_cov = subset_cov.groupby("cell_id").apply(lambda x: x.geometry.area.sum(), include_groups=False)
grid[f"darts_{year}_covered_area"] = grid.cell_id.map(areas_cov)
grid[f"darts_{year}_coverage"] = grid[f"darts_{year}_covered_area"] / grid.geometry.area
grid[f"darts_{year}_rts_density"] = grid[f"darts_{year}_rts_area"] / grid[f"darts_{year}_covered_area"]
output_path = DATA_DIR / f"darts/{gridname}_darts.parquet"
grid.to_parquet(output_path)
print(f"Saved RTS labels to {output_path}")
def main(): # noqa: D103
cyclopts.run(extract_darts_rts)
if __name__ == "__main__":
main()

File diff suppressed because one or more lines are too long

2
uv.lock generated
View file

@ -1199,7 +1199,7 @@ wheels = [
] ]
[[package]] [[package]]
name = "entropic-perma-risk" name = "entropice"
version = "0.1.0" version = "0.1.0"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [