This commit is contained in:
Tobias Hölzer 2025-10-26 18:28:46 +01:00
parent 3ad332b5a8
commit eeab8fff1e
16 changed files with 536 additions and 943 deletions

View file

@ -1,5 +1,5 @@
[project] [project]
name = "entropic-perma-risk" name = "entropice"
version = "0.1.0" version = "0.1.0"
description = "Add your description here" description = "Add your description here"
readme = "README.md" readme = "README.md"
@ -49,10 +49,14 @@ dependencies = [
] ]
[project.scripts] [project.scripts]
create-grid = "steps.s0_0_grids.create_grid:main" create-grid = "entropice.grids:main"
rts = "steps.s0_1_rts.rts:main" darts = "entropice.darts:main"
alpha-earth = "steps.s1_0_alphaearth.alphaearth:main" alpha-earth = "entropice.alphaearth:main"
era5 = "steps.s1_1_era5.era5:cli" era5 = "entropice.era5:cli"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.uv] [tool.uv]
package = true package = true

View file

@ -0,0 +1,2 @@
def hello() -> str:
    """Return the package greeting string."""
    greeting = "Hello from entropice!"
    return greeting

View file

@ -4,9 +4,7 @@ Author: Tobias Hölzer
Date: October 2025 Date: October 2025
""" """
import os
import warnings import warnings
from pathlib import Path
from typing import Literal from typing import Literal
import cyclopts import cyclopts
@ -16,9 +14,13 @@ import geopandas as gpd
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import xarray as xr import xarray as xr
import xdggs
from rich import pretty, print, traceback from rich import pretty, print, traceback
from rich.progress import track from rich.progress import track
from entropice import codecs, grids
from entropice.paths import get_annual_embeddings_file, get_embeddings_store
# Filter out the GeoDataFrame.swapaxes deprecation warning # Filter out the GeoDataFrame.swapaxes deprecation warning
warnings.filterwarnings("ignore", message=".*GeoDataFrame.swapaxes.*", category=FutureWarning) warnings.filterwarnings("ignore", message=".*GeoDataFrame.swapaxes.*", category=FutureWarning)
@ -26,26 +28,19 @@ pretty.install()
traceback.install() traceback.install()
ee.Initialize(project="ee-tobias-hoelzer") ee.Initialize(project="ee-tobias-hoelzer")
DATA_DIR = Path(os.environ.get("DATA_DIR", "../../data")) / "entropyc-rts"
EMBEDDINGS_DIR = DATA_DIR / "embeddings"
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
cli = cyclopts.App(name="alpha-earth") cli = cyclopts.App(name="alpha-earth")
@cli.command() @cli.command()
def download(grid: Literal["hex", "healpix"], level: int, backup_intermediate: bool = False): def download(grid: Literal["hex", "healpix"], level: int):
"""Extract satellite embeddings from Google Earth Engine and map them to a grid. """Extract satellite embeddings from Google Earth Engine and map them to a grid.
Args: Args:
grid (Literal["hex", "healpix"]): The grid type to use. grid (Literal["hex", "healpix"]): The grid type to use.
level (int): The grid level to use. level (int): The grid level to use.
backup_intermediate (bool, optional): Whether to backup intermediate results. Defaults to False.
""" """
gridname = f"permafrost_{grid}{level}" grid_gdf = grids.open(grid, level)
grid = gpd.read_parquet(DATA_DIR / f"grids/{gridname}_grid.parquet")
for year in track(range(2017, 2025), total=8, description="Processing years..."): for year in track(range(2017, 2025), total=8, description="Processing years..."):
embedding_collection = ee.ImageCollection("GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL") embedding_collection = ee.ImageCollection("GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL")
@ -72,9 +67,9 @@ def download(grid: Literal["hex", "healpix"], level: int, backup_intermediate: b
# Process grid in batches of 100 # Process grid in batches of 100
batch_size = 100 batch_size = 100
all_results = [] all_results = []
n_batches = len(grid) // batch_size n_batches = len(grid_gdf) // batch_size
for batch_num, batch_grid in track( for batch_num, batch_grid in track(
enumerate(np.array_split(grid, n_batches)), enumerate(np.array_split(grid_gdf, n_batches)),
description="Processing batches...", description="Processing batches...",
total=n_batches, total=n_batches,
): ):
@ -88,18 +83,12 @@ def download(grid: Literal["hex", "healpix"], level: int, backup_intermediate: b
# Store batch results # Store batch results
all_results.append(df_batch) all_results.append(df_batch)
# Save batch immediately to disk as backup
if backup_intermediate:
batch_filename = f"{gridname}_embeddings-{year}_batch{batch_num:06d}.parquet"
batch_result = batch_grid.merge(df_batch[[*bands, "cell_id"]], on="cell_id", how="left")
batch_result.to_parquet(EMBEDDINGS_DIR / f"{batch_filename}")
# Combine all batch results # Combine all batch results
df = pd.concat(all_results, ignore_index=True) df = pd.concat(all_results, ignore_index=True)
embeddings_on_grid = grid.merge(df[[*bands, "cell_id"]], on="cell_id", how="left") embeddings_on_grid = grid.merge(df[[*bands, "cell_id"]], on="cell_id", how="left")
embeddings_file = EMBEDDINGS_DIR / f"{gridname}_embeddings-{year}.parquet" embeddings_file = get_annual_embeddings_file(grid, level, year)
embeddings_on_grid.to_parquet(embeddings_file) embeddings_on_grid.to_parquet(embeddings_file)
print(f"Saved embeddings for year {year} to {embeddings_file.resolve()}.") print(f"Saved embeddings for year {year} to {embeddings_file}.")
@cli.command() @cli.command()
@ -111,36 +100,38 @@ def combine_to_zarr(grid: Literal["hex", "healpix"], level: int):
level (int): The grid level to use. level (int): The grid level to use.
""" """
embs = gpd.read_parquet(DATA_DIR / "embeddings" / f"permafrost_{grid}{level}_embeddings-2017.parquet") cell_ids = grids.get_cell_ids(grid, level)
# ? Converting cell IDs from hex strings to integers for xdggs compatibility
cells = [int(cid, 16) for cid in embs.cell_id.to_list()]
years = list(range(2017, 2025)) years = list(range(2017, 2025))
aggs = ["median", "stdDev", "min", "max", "mean", "p1", "p5", "p25", "p75", "p95", "p99"] aggs = ["median", "stdDev", "min", "max", "mean", "p1", "p5", "p25", "p75", "p95", "p99"]
bands = [f"A{str(i).zfill(2)}" for i in range(64)] bands = [f"A{str(i).zfill(2)}" for i in range(64)]
a = xr.DataArray( a = xr.DataArray(
np.nan, np.nan,
dims=("year", "cell", "band", "agg"), dims=("year", "cell_ids", "band", "agg"),
coords={"year": years, "cell": cells, "band": bands, "agg": aggs}, coords={"year": years, "cell_ids": cell_ids, "band": bands, "agg": aggs},
) ).astype(np.float32)
# ? These attributes are needed for xdggs # ? These attributes are needed for xdggs
a.cell.attrs = { a.cell_ids.attrs = {
"grid_name": "h3" if grid == "hex" else "healpix", "grid_name": "h3" if grid == "hex" else "healpix",
"level": level, "level": level,
} }
if grid == "healpix": if grid == "healpix":
a.cell.attrs["indexing_scheme"] = "nested" a.cell_ids.attrs["indexing_scheme"] = "nested"
for year in track(years, total=len(years), description="Processing years..."): for year in track(years, total=len(years), description="Processing years..."):
embs = gpd.read_parquet(DATA_DIR / "embeddings" / f"permafrost_{grid}{level}_embeddings-{year}.parquet") embeddings_file = get_annual_embeddings_file(grid, level, year)
embs = gpd.read_parquet(embeddings_file)
for band in bands: for band in bands:
for agg in aggs: for agg in aggs:
col = f"{band}_{agg}" col = f"{band}_{agg}"
a.loc[{"band": band, "agg": agg, "year": year}] = embs[col].to_list() a.loc[{"band": band, "agg": agg, "year": year}] = embs[col].to_list()
zarr_path = EMBEDDINGS_DIR / f"permafrost_{grid}{level}_embeddings.zarr" a = xdggs.decode(a)
a.to_zarr(zarr_path, consolidated=False, mode="w")
print(f"Saved combined embeddings to {zarr_path.resolve()}.") zarr_path = get_embeddings_store(grid, level)
a.to_zarr(zarr_path, consolidated=False, mode="w", encoding=codecs.from_ds(a))
print(f"Saved combined embeddings to {zarr_path}.")
def main(): # noqa: D103 def main(): # noqa: D103

31
src/entropice/codecs.py Normal file
View file

@ -0,0 +1,31 @@
"""Encoding utilities for zarr dataset storage."""
import xarray as xr
from zarr.codecs import BloscCodec
def from_ds(ds: xr.Dataset, store_floats_as_float32: bool = True, include_coords: bool = True) -> dict:
    """Create compression encoding for zarr dataset storage.

    Builds a Blosc compression entry (zstd, compression level 5) for every data
    variable — and optionally every coordinate — of the dataset.

    Args:
        ds (xr.Dataset): The xarray Dataset to create encoding for.
        store_floats_as_float32 (bool, optional): Whether to store floating point data as float32.
            Defaults to True.
        include_coords (bool, optional): Whether to include coordinates in the encoding.
            This is useful when appending to an existing store.
            Defaults to True.

    Returns:
        dict: Encoding dictionary with compression settings for each variable.
    """
    names = list(ds.data_vars)
    if include_coords:
        names.extend(ds.coords)
    encoding: dict = {name: {"compressors": BloscCodec(cname="zstd", clevel=5)} for name in names}
    if store_floats_as_float32:
        # Only float64 data variables are downcast; coordinates keep their dtype.
        for name in ds.data_vars:
            if ds[name].dtype == "float64":
                encoding[name]["dtype"] = "float32"
    return encoding

72
src/entropice/darts.py Normal file
View file

@ -0,0 +1,72 @@
"""Labels of Retrogressive-Thaw-Slumps (RTS).
Assumes that the level 1 and level 2 DARTS features have been downloaded into $DATA_DIR / entropyc-rts / darts: https://arcticdata.io/catalog/view/doi:10.18739/A22B8VD7C
Author: Tobias Hölzer
Date: October 2025
"""
from typing import Literal
import cyclopts
import geopandas as gpd
from rich import pretty, print, traceback
from rich.progress import track
from stopuhr import stopwatch
from entropice import grids
from entropice.paths import dartsl2_cov_file, dartsl2_file, get_darts_rts_file
traceback.install()
pretty.install()
def extract_darts_rts(grid: Literal["hex", "healpix"], level: int) -> None:
    """Extract RTS labels from DARTS dataset.

    Intersects the DARTS level-2 RTS features and their coverage footprints with
    the requested grid, then derives per-cell, per-year label columns
    (``darts_{year}_rts_count`` / ``_rts_area`` / ``_covered_area`` /
    ``_coverage`` / ``_rts_density``) and writes the enriched grid to parquet.

    Args:
        grid (Literal["hex", "healpix"]): The grid type to use.
        level (int): The grid level to use.
    """
    with stopwatch("Load data"):
        darts_l2 = gpd.read_parquet(dartsl2_file)
        darts_cov_l2 = gpd.read_parquet(dartsl2_cov_file)
        grid_gdf = grids.open(grid, level)
    with stopwatch("Extract RTS labels"):
        # Clip DARTS features to grid cells; reprojects DARTS into the grid CRS first.
        grid_l2 = grid_gdf.overlay(darts_l2.to_crs(grid_gdf.crs), how="intersection")
        grid_cov_l2 = grid_gdf.overlay(darts_cov_l2.to_crs(grid_gdf.crs), how="intersection")
    # Years are taken from the coverage layer, so cells without coverage in a year stay NaN.
    years = list(grid_cov_l2["year"].unique())
    for year in track(years, total=len(years), description="Processing years..."):
        with stopwatch("Processing RTS", log=False):
            subset = grid_l2[grid_l2["year"] == year]
            subset_cov = grid_cov_l2[grid_cov_l2["year"] == year]
            # Number of RTS fragments intersecting each cell this year.
            counts = subset.groupby("cell_id").size()
            grid_gdf[f"darts_{year}_rts_count"] = grid_gdf.cell_id.map(counts)
            # Summed RTS area per cell. NOTE(review): areas are in CRS units —
            # assumes the grid CRS is projected (metric); confirm for healpix grids.
            areas = subset.groupby("cell_id").apply(lambda x: x.geometry.area.sum(), include_groups=False)
            grid_gdf[f"darts_{year}_rts_area"] = grid_gdf.cell_id.map(areas)
            # Area of the cell actually covered by DARTS observations this year.
            areas_cov = subset_cov.groupby("cell_id").apply(lambda x: x.geometry.area.sum(), include_groups=False)
            grid_gdf[f"darts_{year}_covered_area"] = grid_gdf.cell_id.map(areas_cov)
            # Fraction of the cell observed, and RTS area normalized by observed area.
            grid_gdf[f"darts_{year}_coverage"] = grid_gdf[f"darts_{year}_covered_area"] / grid_gdf.geometry.area
            grid_gdf[f"darts_{year}_rts_density"] = (
                grid_gdf[f"darts_{year}_rts_area"] / grid_gdf[f"darts_{year}_covered_area"]
            )
    output_path = get_darts_rts_file(grid, level)
    grid_gdf.to_parquet(output_path)
    print(f"Saved RTS labels to {output_path}")
    stopwatch.summary()
def main():
    """CLI entry point: dispatch ``extract_darts_rts`` via cyclopts."""
    cyclopts.run(extract_darts_rts)


if __name__ == "__main__":
    main()

View file

@ -1,3 +1,4 @@
# ruff: noqa: PD011
"""Download and preprocess ERA5 data. """Download and preprocess ERA5 data.
Variables of Interest: Variables of Interest:
@ -9,6 +10,12 @@ Variables of Interest:
- Surface sensible heat flux (sshf) [accum] - Surface sensible heat flux (sshf) [accum]
- Lake ice bottom temperature (lblt) [instant] - Lake ice bottom temperature (lblt) [instant]
Snow Fall and Total precipitation are both further accumulated over a day -
thus instead of taking the daily sum, only the last value of the day is taken.
This is 00:00 of the next day, e.g.:
tp_2020_06_23 <- tp_2020_06_24_00:00
See: https://confluence.ecmwf.int/display/CKB/ERA5-Land%3A+data+documentation#ERA5Land:datadocumentation-accumulationsAccumulations
Naming patterns: Naming patterns:
- Instant Variables are downloaded already as statistically aggregated (lossy), - Instant Variables are downloaded already as statistically aggregated (lossy),
therefore their names get the aggregation as suffix therefore their names get the aggregation as suffix
@ -66,14 +73,12 @@ Author: Tobias Hölzer
Date: June to October 2025 Date: June to October 2025
""" """
import os import cProfile
import time import time
from pathlib import Path
from typing import Literal from typing import Literal
import cyclopts import cyclopts
import dask.distributed as dd import dask.distributed as dd
import geopandas as gpd
import numpy as np import numpy as np
import odc.geo import odc.geo
import odc.geo.xr import odc.geo.xr
@ -84,32 +89,16 @@ import xarray as xr
from rich import pretty, print, traceback from rich import pretty, print, traceback
from rich.progress import track from rich.progress import track
from shapely.geometry import LineString, Polygon from shapely.geometry import LineString, Polygon
from zarr.codecs import BloscCodec from stopuhr import stopwatch
traceback.install(show_locals=True, suppress=[cyclopts, xr, pd]) from entropice import codecs, grids, watermask
from entropice.paths import get_era5_stores
traceback.install(show_locals=True, suppress=[cyclopts, xr, pd, cProfile])
pretty.install() pretty.install()
cli = cyclopts.App() cli = cyclopts.App()
DATA_DIR = Path(os.environ.get("DATA_DIR", "data")) / "entropyc-rts"
ERA5_DIR = DATA_DIR / "era5"
DAILY_RAW_PATH = ERA5_DIR / "daily_raw.zarr"
DAILY_ENRICHED_PATH = ERA5_DIR / "daily_enriched.zarr"
MONTHLY_RAW_PATH = ERA5_DIR / "monthly_raw.zarr"
YEARLY_RAW_PATH = ERA5_DIR / "yearly_aligned.zarr"
SUMMER_RAW_PATH = ERA5_DIR / "summer_aligned.zarr"
WINTER_RAW_PATH = ERA5_DIR / "winter_aligned.zarr"
def _get_grid_paths(
agg: Literal["daily", "monthly", "summer", "winter", "yearly"],
grid: Literal["hex", "healpix"],
level: int,
):
gridname = f"permafrost_{grid}{level}"
aligned_path = ERA5_DIR / f"{agg}_{gridname}.zarr"
return aligned_path
min_lat = 50 min_lat = 50
max_lat = 83.7 # Ensures the right Chunks Size (90 - 64 / 10 + 0.1) max_lat = 83.7 # Ensures the right Chunks Size (90 - 64 / 10 + 0.1)
@ -145,24 +134,6 @@ accums = {
} }
def create_encoding(ds: xr.Dataset):
"""Create compression encoding for zarr dataset storage.
Creates Blosc compression configuration for all data variables and coordinates
in the dataset using zstd compression with level 5.
Args:
ds (xr.Dataset): The xarray Dataset to create encoding for.
Returns:
dict: Encoding dictionary with compression settings for each variable.
"""
# encoding = {var: {"compressors": BloscCodec(cname="zlib", clevel=9)} for var in ds.data_vars}
encoding = {var: {"compressors": BloscCodec(cname="zstd", clevel=5)} for var in [*ds.data_vars, *ds.coords]}
return encoding
# ================ # ================
# === Download === # === Download ===
# ================ # ================
@ -208,16 +179,18 @@ def download_daily_aggregated():
daily_raw = xr.merge( daily_raw = xr.merge(
[ [
# Instant # Instant
era5.t2m.resample(time="1D").max().rename("t2m_max"), era5.t2m.resample(time="1D").max().rename("t2m_max").astype(np.float32),
era5.t2m.resample(time="1D").min().rename("t2m_min"), era5.t2m.resample(time="1D").min().rename("t2m_min").astype(np.float32),
era5.t2m.resample(time="1D").mean().rename("t2m_mean"), era5.t2m.resample(time="1D").mean().rename("t2m_mean").astype(np.float32),
era5.snowc.resample(time="1D").mean().rename("snowc_mean"), era5.snowc.resample(time="1D").mean().rename("snowc_mean").astype(np.float32),
era5.sde.resample(time="1D").mean().rename("sde_mean"), era5.sde.resample(time="1D").mean().rename("sde_mean").astype(np.float32),
era5.lblt.resample(time="1D").max().rename("lblt_max"), era5.lblt.resample(time="1D").max().rename("lblt_max").astype(np.float32),
# Accum # Accum
era5.tp.resample(time="1D").sum().rename("tp"), era5.sshf.resample(time="1D").sum(skipna=False).rename("sshf").astype(np.float32),
era5.sf.resample(time="1D").sum().rename("sf"), # Precipitation and snow fall are special
era5.sshf.resample(time="1D").sum().rename("sshf"), # Take only the last value of the day (00:00 of next day)
era5.tp.resample(time="1D").first().shift(time=-1).rename("tp").astype(np.float32),
era5.sf.resample(time="1D").first().shift(time=-1).rename("sf").astype(np.float32),
] ]
) )
@ -225,16 +198,18 @@ def download_daily_aggregated():
daily_raw["t2m_max"].attrs = {"long_name": "Daily maximum 2 metre temperature", "units": "K"} daily_raw["t2m_max"].attrs = {"long_name": "Daily maximum 2 metre temperature", "units": "K"}
daily_raw["t2m_min"].attrs = {"long_name": "Daily minimum 2 metre temperature", "units": "K"} daily_raw["t2m_min"].attrs = {"long_name": "Daily minimum 2 metre temperature", "units": "K"}
daily_raw["t2m_mean"].attrs = {"long_name": "Daily mean 2 metre temperature", "units": "K"} daily_raw["t2m_mean"].attrs = {"long_name": "Daily mean 2 metre temperature", "units": "K"}
daily_raw["tp"].attrs = {"long_name": "Daily total precipitation", "units": "m"}
daily_raw["sf"].attrs = {"long_name": "Daily total snow fall", "units": "m"}
daily_raw["snowc_mean"].attrs = {"long_name": "Daily mean snow cover", "units": "m"} daily_raw["snowc_mean"].attrs = {"long_name": "Daily mean snow cover", "units": "m"}
daily_raw["sde_mean"].attrs = {"long_name": "Daily mean snow depth", "units": "m"} daily_raw["sde_mean"].attrs = {"long_name": "Daily mean snow depth", "units": "m"}
daily_raw["sshf"].attrs = {"long_name": "Daily total surface sensible heat flux", "units": "J/m²"}
daily_raw["lblt_max"].attrs = {"long_name": "Daily maximum lake ice bottom temperature", "units": "K"} daily_raw["lblt_max"].attrs = {"long_name": "Daily maximum lake ice bottom temperature", "units": "K"}
daily_raw["tp"].attrs = {"long_name": "Daily total precipitation", "units": "m"} # Units are rather m^3 / m^2
daily_raw["sf"].attrs = {"long_name": "Daily total snow fall", "units": "m"} # Units are rather m^3 / m^2
daily_raw["sshf"].attrs = {"long_name": "Daily total surface sensible heat flux", "units": "J/m²"}
daily_raw = daily_raw.odc.assign_crs("epsg:4326") daily_raw = daily_raw.odc.assign_crs("epsg:4326")
daily_raw = daily_raw.drop_vars(["surface", "number", "depthBelowLandLayer"]) daily_raw = daily_raw.drop_vars(["surface", "number", "depthBelowLandLayer"])
daily_raw.to_zarr(DAILY_RAW_PATH, mode="w", encoding=create_encoding(daily_raw), consolidated=False) daily_store = get_era5_stores("daily")
print(f"Saving downloaded and daily aggregated ERA5 data to {daily_store}.")
daily_raw.to_zarr(daily_store, mode="w", encoding=codecs.from_ds(daily_raw), consolidated=False)
@cli.command @cli.command
@ -252,7 +227,7 @@ def download():
print(client) print(client)
print(client.dashboard_link) print(client.dashboard_link)
download_daily_aggregated() download_daily_aggregated()
print(f"Downloaded and aggregated ERA5 data to {DAILY_RAW_PATH.resolve()}.") print("Downloaded and aggregated ERA5 data.")
# ============================ # ============================
@ -275,36 +250,62 @@ def daily_enrich():
- Snow isolation index - Snow isolation index
""" """
daily = xr.open_zarr(DAILY_RAW_PATH, consolidated=False).set_coords("spatial_ref") daily_store = get_era5_stores("daily")
daily = xr.open_zarr(daily_store, consolidated=False).set_coords("spatial_ref")
assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}" assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}"
# For better dask performance, all variables are written immediately after calculation
# The smart scheduling which could performantly handle more is according to docs not yet implemented.
# See https://docs.xarray.dev/en/stable/user-guide/dask.html#best-practices point 3 or https://github.com/dask/dask/issues/874
def _store(v: str):
nonlocal daily
encoding = codecs.from_ds(daily[[v]], include_coords=False)
print(f"Storing enriched daily variable {v} to {daily_store}...")
with stopwatch("Storing enriched daily variable"):
daily[[v]].to_zarr(daily_store, mode="a", encoding=encoding, consolidated=False)
daily = xr.open_zarr(daily_store, consolidated=False).set_coords("spatial_ref")
# Formulas based on Groeke et. al. (2025) Stochastic Weather generation... # Formulas based on Groeke et. al. (2025) Stochastic Weather generation...
daily["t2m_avg"] = (daily.t2m_max + daily.t2m_min) / 2 daily["t2m_avg"] = (daily.t2m_max + daily.t2m_min) / 2
daily.t2m_avg.attrs = {"long_name": "Daily average 2 metre temperature", "units": "K"} daily.t2m_avg.attrs = {"long_name": "Daily average 2 metre temperature", "units": "K"}
_store("t2m_avg")
daily["t2m_range"] = daily.t2m_max - daily.t2m_min daily["t2m_range"] = daily.t2m_max - daily.t2m_min
daily.t2m_range.attrs = {"long_name": "Daily range of 2 metre temperature", "units": "K"} daily.t2m_range.attrs = {"long_name": "Daily range of 2 metre temperature", "units": "K"}
_store("t2m_range")
daily["t2m_skew"] = (daily.t2m_mean - daily.t2m_min) / daily.t2m_range daily["t2m_skew"] = (daily.t2m_mean - daily.t2m_min) / daily.t2m_range
daily.t2m_skew.attrs = {"long_name": "Daily skewness of 2 metre temperature"} daily.t2m_skew.attrs = {"long_name": "Daily skewness of 2 metre temperature"}
_store("t2m_skew")
daily["thawing_degree_days"] = (daily.t2m_avg - 273.15).clip(min=0) daily["thawing_degree_days"] = (daily.t2m_avg - 273.15).clip(min=0)
daily.thawing_degree_days.attrs = {"long_name": "Thawing degree days", "units": "K"} daily.thawing_degree_days.attrs = {"long_name": "Thawing degree days", "units": "K"}
_store("thawing_degree_days")
daily["freezing_degree_days"] = (273.15 - daily.t2m_avg).clip(min=0) daily["freezing_degree_days"] = (273.15 - daily.t2m_avg).clip(min=0)
daily.freezing_degree_days.attrs = {"long_name": "Freezing degree days", "units": "K"} daily.freezing_degree_days.attrs = {"long_name": "Freezing degree days", "units": "K"}
_store("freezing_degree_days")
daily["thawing_days"] = (daily.t2m_avg > 273.15).astype(int) daily["thawing_days"] = (daily.t2m_avg > 273.15).astype(int)
daily.thawing_days.attrs = {"long_name": "Thawing days"} daily.thawing_days.attrs = {"long_name": "Thawing days"}
_store("thawing_days")
daily["freezing_days"] = (daily.t2m_avg < 273.15).astype(int) daily["freezing_days"] = (daily.t2m_avg < 273.15).astype(int)
daily.freezing_days.attrs = {"long_name": "Freezing days"} daily.freezing_days.attrs = {"long_name": "Freezing days"}
_store("freezing_days")
daily["precipitation_occurrences"] = (daily.tp > 0).astype(int) daily["precipitation_occurrences"] = (daily.tp > 0.001).astype(int)
daily.precipitation_occurrences.attrs = {"long_name": "Precipitation occurrences"} daily.precipitation_occurrences.attrs = {"long_name": "Precipitation occurrences"}
daily["snowfall_occurrences"] = (daily.sf > 0).astype(int) _store("precipitation_occurrences")
daily["snowfall_occurrences"] = (daily.sf > 0.001).astype(int)
daily.snowfall_occurrences.attrs = {"long_name": "Snowfall occurrences"} daily.snowfall_occurrences.attrs = {"long_name": "Snowfall occurrences"}
_store("snowfall_occurrences")
daily["naive_snow_isolation"] = daily.snowc_mean * daily.sde_mean daily["naive_snow_isolation"] = daily.snowc_mean * daily.sde_mean
daily.naive_snow_isolation.attrs = {"long_name": "Naive snow isolation"} daily.naive_snow_isolation.attrs = {"long_name": "Naive snow isolation"}
_store("naive_snow_isolation")
daily.to_zarr(DAILY_ENRICHED_PATH, mode="w", encoding=create_encoding(daily), consolidated=False)
def monthly_aggregate(): def monthly_aggregate():
@ -318,7 +319,8 @@ def monthly_aggregate():
The aggregated monthly data is saved to a zarr file for further processing. The aggregated monthly data is saved to a zarr file for further processing.
""" """
daily = xr.open_zarr(DAILY_ENRICHED_PATH, consolidated=False) daily_store = get_era5_stores("daily")
daily = xr.open_zarr(daily_store, consolidated=False).set_coords("spatial_ref")
assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}" assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}"
daily = daily.sel(time=slice(min_time, max_time)) daily = daily.sel(time=slice(min_time, max_time))
@ -348,8 +350,10 @@ def monthly_aggregate():
monthly_accums.append(agg) monthly_accums.append(agg)
monthly = xr.merge(monthly_instants + monthly_accums) monthly = xr.merge(monthly_instants + monthly_accums)
monthly = monthly.chunk({"time": len(monthly.time), "latitude": 64, "longitude": 64}) monthly = monthly.chunk({"time": len(monthly.time), "latitude": 256, "longitude": 256}) # ~ 100Mb / chunk for f32
monthly.to_zarr(MONTHLY_RAW_PATH, mode="w", encoding=create_encoding(monthly), consolidated=False) monthly_store = get_era5_stores("monthly")
print(f"Saving monthly aggregated ERA5 data to {monthly_store}.")
monthly.to_zarr(monthly_store, mode="w", encoding=codecs.from_ds(monthly), consolidated=False)
def multi_monthly_aggregate(monthly: xr.Dataset, n: int = 12) -> xr.Dataset: def multi_monthly_aggregate(monthly: xr.Dataset, n: int = 12) -> xr.Dataset:
@ -407,47 +411,12 @@ def multi_monthly_aggregate(monthly: xr.Dataset, n: int = 12) -> xr.Dataset:
"link": "https://tc.copernicus.org/articles/11/989/2017/tc-11-989-2017.pdf", "link": "https://tc.copernicus.org/articles/11/989/2017/tc-11-989-2017.pdf",
} }
multimonthly = multimonthly.chunk({"time": len(multimonthly.time), "latitude": 64, "longitude": 64}) multimonthly = multimonthly.chunk(
{"time": len(multimonthly.time), "latitude": 128, "longitude": 1024}
) # ~36Mb / chunk for f64
return multimonthly return multimonthly
def yearly_and_seasonal_aggregate():
"""Aggregate monthly ERA5 data to yearly resolution with seasonal splits.
Takes monthly aggregated data and creates yearly aggregates using a shifted
calendar (October to September) to better capture Arctic seasonal patterns.
Creates separate aggregates for full year, winter (Oct-Apr), and summer
(May-Sep) periods.
The first and last incomplete years are excluded from the analysis.
Winter months are defined as months 1-7 in the shifted calendar,
and summer months are 8-12.
The final dataset includes yearly, winter, and summer aggregates for all
climate variables, saved to a zarr file.
"""
monthly = xr.open_zarr(MONTHLY_RAW_PATH, consolidated=False).set_coords("spatial_ref")
assert "time" in monthly.dims, f"Expected dim 'time' to be in {monthly.dims=}"
# "Shift" the calendar by slicing the first Jan-Sep and the last Oct-Dec months
first_year = monthly.time.dt.year.min().item()
last_year = monthly.time.dt.year.max().item()
monthly = monthly.sel(time=slice(f"{first_year}-10-01", f"{last_year}-09-30"))
yearly = multi_monthly_aggregate(monthly, n=12)
yearly = derive_yearly_variables(yearly)
yearly.to_zarr(YEARLY_RAW_PATH, mode="w", encoding=create_encoding(yearly), consolidated=False)
summer_winter = multi_monthly_aggregate(monthly, n=6)
summer = summer_winter.sel(time=summer_winter.time.dt.month == 4)
summer.to_zarr(SUMMER_RAW_PATH, mode="w", encoding=create_encoding(summer), consolidated=False)
winter = summer_winter.sel(time=summer_winter.time.dt.month == 10)
winter.to_zarr(WINTER_RAW_PATH, mode="w", encoding=create_encoding(winter), consolidated=False)
def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset: def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
"""Derive additional variables from daily data and add them to the yearly dataset. """Derive additional variables from daily data and add them to the yearly dataset.
@ -459,7 +428,8 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
""" """
assert "time" in yearly.dims, f"Expected dim 'time' to be in {yearly.dims=}" assert "time" in yearly.dims, f"Expected dim 'time' to be in {yearly.dims=}"
daily = xr.open_zarr(DAILY_ENRICHED_PATH, consolidated=False).set_coords("spatial_ref") daily_store = get_era5_stores("daily")
daily = xr.open_zarr(daily_store, consolidated=False).set_coords("spatial_ref")
assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}" assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}"
daily = daily.sel(time=slice(min_time, max_time)) daily = daily.sel(time=slice(min_time, max_time))
# ? Note: The functions do not really account for leap years # ? Note: The functions do not really account for leap years
@ -476,7 +446,7 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
# This assumes that the 01-01 is almost everywhere one of the coldest days in the year # This assumes that the 01-01 is almost everywhere one of the coldest days in the year
first_thaw_day = daily.thawing_days.groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1 first_thaw_day = daily.thawing_days.groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1
first_thaw_day = first_thaw_day.where(~never_thaws).rename("day_of_first_thaw").rename(year="time") first_thaw_day = first_thaw_day.where(~never_thaws).rename("day_of_first_thaw").rename(year="time")
first_thaw_day["time"] = pd.to_datetime([f"{y}-10-01" for y in first_thaw_day.time.values]) # noqa: PD011 first_thaw_day["time"] = pd.to_datetime([f"{y}-10-01" for y in first_thaw_day.time.values])
first_thaw_day.attrs = {"long_name": "Day of first thaw in year", "units": "day of year"} first_thaw_day.attrs = {"long_name": "Day of first thaw in year", "units": "day of year"}
yearly["day_of_first_thaw"] = first_thaw_day.sel(time=yearly.time) yearly["day_of_first_thaw"] = first_thaw_day.sel(time=yearly.time)
@ -484,7 +454,7 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
n_days_in_year - daily.thawing_days[::-1].groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1 n_days_in_year - daily.thawing_days[::-1].groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1
) )
last_thaw_day = last_thaw_day.where(~never_thaws).rename("day_of_last_thaw").rename(year="time") last_thaw_day = last_thaw_day.where(~never_thaws).rename("day_of_last_thaw").rename(year="time")
last_thaw_day["time"] = pd.to_datetime([f"{y}-10-01" for y in last_thaw_day.time.values]) # noqa: PD011 last_thaw_day["time"] = pd.to_datetime([f"{y}-10-01" for y in last_thaw_day.time.values])
last_thaw_day.attrs = {"long_name": "Day of last thaw in year", "units": "day of year"} last_thaw_day.attrs = {"long_name": "Day of last thaw in year", "units": "day of year"}
yearly["day_of_last_thaw"] = last_thaw_day.sel(time=yearly.time) yearly["day_of_last_thaw"] = last_thaw_day.sel(time=yearly.time)
@ -506,7 +476,7 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
first_freezing_day = daily_shifted.freezing_days.groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1 first_freezing_day = daily_shifted.freezing_days.groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1
first_freezing_day = first_freezing_day.where(~never_freezes).rename("day_of_first_freeze").rename(year="time") first_freezing_day = first_freezing_day.where(~never_freezes).rename("day_of_first_freeze").rename(year="time")
first_freezing_day["time"] = pd.to_datetime([f"{y}-10-01" for y in first_freezing_day.time.values]) # noqa: PD011 first_freezing_day["time"] = pd.to_datetime([f"{y}-10-01" for y in first_freezing_day.time.values])
first_freezing_day.attrs = {"long_name": "Day of first freeze in year", "units": "day of year"} first_freezing_day.attrs = {"long_name": "Day of first freeze in year", "units": "day of year"}
yearly["day_of_first_freeze"] = first_freezing_day.sel(time=yearly.time) yearly["day_of_first_freeze"] = first_freezing_day.sel(time=yearly.time)
@ -516,7 +486,7 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
+ 1 + 1
) )
last_freezing_day = last_freezing_day.where(~never_freezes).rename("day_of_last_freeze").rename(year="time") last_freezing_day = last_freezing_day.where(~never_freezes).rename("day_of_last_freeze").rename(year="time")
last_freezing_day["time"] = pd.to_datetime([f"{y}-10-01" for y in last_freezing_day.time.values]) # noqa: PD011 last_freezing_day["time"] = pd.to_datetime([f"{y}-10-01" for y in last_freezing_day.time.values])
last_freezing_day.attrs = {"long_name": "Day of last freeze in year", "units": "day of year"} last_freezing_day.attrs = {"long_name": "Day of last freeze in year", "units": "day of year"}
yearly["day_of_last_freeze"] = last_freezing_day.sel(time=yearly.time) yearly["day_of_last_freeze"] = last_freezing_day.sel(time=yearly.time)
@ -528,6 +498,53 @@ def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
return yearly return yearly
def yearly_and_seasonal_aggregate():
"""Aggregate monthly ERA5 data to yearly resolution with seasonal splits.
Takes monthly aggregated data and creates yearly aggregates using a shifted
calendar (October to September) to better capture Arctic seasonal patterns.
Creates separate aggregates for full year, winter (Oct-Apr), and summer
(May-Sep) periods.
The first and last incomplete years are excluded from the analysis.
Winter months are defined as months 1-7 in the shifted calendar,
and summer months are 8-12.
The final dataset includes yearly, winter, and summer aggregates for all
climate variables, saved to a zarr file.
"""
monthly_store = get_era5_stores("monthly")
monthly = xr.open_zarr(monthly_store, consolidated=False).set_coords("spatial_ref")
assert "time" in monthly.dims, f"Expected dim 'time' to be in {monthly.dims=}"
# "Shift" the calendar by slicing the first Jan-Sep and the last Oct-Dec months
first_year = monthly.time.dt.year.min().item()
last_year = monthly.time.dt.year.max().item()
monthly = monthly.sel(time=slice(f"{first_year}-10-01", f"{last_year}-09-30"))
yearly = multi_monthly_aggregate(monthly, n=12)
yearly = derive_yearly_variables(yearly)
yearly = yearly.chunk({"time": len(yearly.time), "latitude": 256, "longitude": 1024}) # ~36Mb / chunk for f32
yearly_store = get_era5_stores("yearly")
print(f"Saving yearly aggregated ERA5 data to {yearly_store}.")
yearly.to_zarr(yearly_store, mode="w", encoding=codecs.from_ds(yearly), consolidated=False)
summer_winter = multi_monthly_aggregate(monthly, n=6)
summer = summer_winter.sel(time=summer_winter.time.dt.month == 4)
summer = summer.chunk({"time": len(summer.time), "latitude": 256, "longitude": 1024}) # ~36Mb / chunk for f32
summer_store = get_era5_stores("summer")
print(f"Saving summer aggregated ERA5 data to {summer_store}.")
summer.to_zarr(summer_store, mode="w", encoding=codecs.from_ds(summer), consolidated=False)
winter = summer_winter.sel(time=summer_winter.time.dt.month == 10)
winter = winter.chunk({"time": len(winter.time), "latitude": 256, "longitude": 1024}) # ~36Mb / chunk for f32
winter_store = get_era5_stores("winter")
print(f"Saving winter aggregated ERA5 data to {winter_store}.")
winter.to_zarr(winter_store, mode="w", encoding=codecs.from_ds(winter), consolidated=False)
@cli.command @cli.command
def enrich(n_workers: int = 10, monthly: bool = True, yearly: bool = True, daily: bool = True): def enrich(n_workers: int = 10, monthly: bool = True, yearly: bool = True, daily: bool = True):
"""Enrich data and pPerform temporal aggregation of ERA5 data using Dask cluster. """Enrich data and pPerform temporal aggregation of ERA5 data using Dask cluster.
@ -591,15 +608,16 @@ def _check_geom(geobox: odc.geo.geobox.GeoBox, geom: odc.geo.Geometry) -> bool:
return (roix.stop - roix.start) > 1 and (roiy.stop - roiy.start) > 1 return (roix.stop - roix.start) > 1 and (roiy.stop - roiy.start) > 1
def extract_cell_data(yearly: xr.Dataset, geom: Polygon): @stopwatch("Getting corrected geometries", log=False)
"""Extract ERA5 data for a specific grid cell geometry. def _get_corrected_geoms(geom: Polygon, gbox: odc.geo.geobox.GeoBox) -> list[odc.geo.Geometry]:
"""Get corrected geometries for antimeridian-crossing polygons.
Extracts and spatially averages ERA5 data within the bounds of a grid cell.
Handles antimeridian-crossing cells by splitting them appropriately.
Args: Args:
yearly (xr.Dataset): Yearly aggregated ERA5 dataset. geom (Polygon): Input polygon geometry.
geom (Polygon): Polygon geometry of the grid cell. gbox (odc.geo.geobox.GeoBox): GeoBox for spatial reference.
Returns:
list[odc.geo.Geometry]: List of corrected, georeferenced geometries.
""" """
# cell.geometry is a shapely Polygon # cell.geometry is a shapely Polygon
@ -608,28 +626,73 @@ def extract_cell_data(yearly: xr.Dataset, geom: Polygon):
# Split geometry in case it crossed antimeridian # Split geometry in case it crossed antimeridian
else: else:
geoms = _split_antimeridian_cell(geom) geoms = _split_antimeridian_cell(geom)
cell_data = []
for geom in geoms: geoms = [odc.geo.Geometry(g, crs="epsg:4326") for g in geoms]
geom = odc.geo.Geometry(geom, crs="epsg:4326") geoms = filter(lambda g: _check_geom(gbox, g), geoms)
if not _check_geom(yearly.odc.geobox, geom): return geoms
continue
cell_data.append(yearly.odc.crop(geom).drop_vars("spatial_ref").mean(["latitude", "longitude"]))
if len(cell_data) == 0: @stopwatch("Extracting cell data", log=False)
def extract_cell_data(ds: xr.Dataset, geoms: list[odc.geo.Geometry]) -> xr.Dataset | bool:
"""Extract ERA5 data for a specific grid cell geometry.
Extracts and spatially averages ERA5 data within the bounds of a grid cell.
Handles antimeridian-crossing cells by splitting them appropriately.
Args:
ds (xr.Dataset): An ERA5 dataset.
geoms (list[odc.geo.Geometry]): List of (valid) geometries of the grid cell.
"""
if len(geoms) == 0:
return False return False
elif len(cell_data) == 1: elif len(geoms) == 1:
cell_data = cell_data[0] return ds.odc.crop(geoms[0]).drop_vars("spatial_ref").mean(["latitude", "longitude"], skipna=True).compute()
else: else:
cell_data = xr.concat(cell_data, dim="part").mean("part") parts = [
cell_data = cell_data.compute() ds.odc.crop(geom).drop_vars("spatial_ref").mean(["latitude", "longitude"], skipna=True) for geom in geoms
return cell_data ]
parts = [part for part in parts if part.latitude.size > 0 and part.longitude.size > 0]
if len(parts) == 0:
raise ValueError("No valid parts found for geometry. This should not happen!")
elif len(parts) == 1:
return parts[0].compute()
else:
return xr.concat(parts, dim="part").mean("part", skipna=True).compute()
def _correct_longs(ds: xr.Dataset) -> xr.Dataset:
return ds.assign_coords(longitude=(((ds.longitude + 180) % 360) - 180)).sortby("longitude")
@stopwatch("Creating aligned dataset", log=False)
def _create_aligned(
ds: xr.Dataset, data: dict[str, np.ndarray], grid: Literal["hex", "healpix"], level: int
) -> xr.Dataset:
cell_ids = grids.get_cell_ids(grid, level)
data_vars = {var: (["cell_ids", "time"], values) for var, values in data.items()}
aligned = xr.Dataset(
data_vars,
coords={"cell_ids": cell_ids, "time": ds.time},
)
gridinfo = {
"grid_name": "h3" if grid == "hex" else grid,
"level": level,
}
if grid == "healpix":
gridinfo["indexing_scheme"] = "nested"
aligned.cell_ids.attrs = gridinfo
for var in ds.data_vars:
aligned[var].attrs = ds[var].attrs
aligned = aligned.chunk({"cell_ids": min(len(aligned.cell_ids), 10000), "time": len(aligned.time)})
return aligned
@cli.command @cli.command
def spatial_agg( def spatial_agg(
grid: Literal["hex", "healpix"], grid: Literal["hex", "healpix"],
level: int, level: int,
agg: Literal["summer", "winter", "yearly"] = "yearly",
n_workers: int = 10,
): ):
"""Perform spatial aggregation of ERA5 data to grid cells. """Perform spatial aggregation of ERA5 data to grid cells.
@ -640,69 +703,73 @@ def spatial_agg(
Args: Args:
grid ("hex" | "healpix"): Grid type. grid ("hex" | "healpix"): Grid type.
level (int): Grid resolution level. level (int): Grid resolution level.
agg ("summer" | "winter" | "yearly"): Type of aggregation to perform. Defaults to yearly.
n_workers (int, optional): Number of parallel workers to use. Defaults to 10.
""" """
gridname = f"permafrost_{grid}{level}" grid_gdf = grids.open(grid, level)
agg_grid_path = _get_grid_paths(agg, grid, level) # ? Mask out water, since we don't want to aggregate over oceans
grid_df = gpd.read_parquet(DATA_DIR / f"grids/{gridname}_grid.parquet") grid_gdf = watermask.clip_grid(grid_gdf)
# Create an empty zarr array with the right dimensions
if agg == "summer":
agg_data_path = SUMMER_RAW_PATH
elif agg == "winter":
agg_data_path = WINTER_RAW_PATH
elif agg == "yearly":
agg_data_path = YEARLY_RAW_PATH
else:
raise ValueError(f"Unknown aggregation type: {agg}")
agg_raw = (
xr.open_zarr(agg_data_path, consolidated=False, decode_timedelta=False)
.set_coords("spatial_ref")
.drop_vars(["surface", "number", "depthBelowLandLayer"])
.load()
)
assert {"latitude", "longitude", "time"} == set(agg_raw.dims), (
f"Expected dims ('latitude', 'longitude', 'time'), got {agg_raw.dims}"
)
assert agg_raw.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {agg_raw.odc.crs}"
# Convert lons to -180 to 180 instead of 0 to 360 summer_unaligned_store = get_era5_stores("summer")
agg_raw = agg_raw.assign_coords(longitude=(((agg_raw.longitude + 180) % 360) - 180)).sortby("longitude") winter_unaligned_store = get_era5_stores("winter")
yearly_unaligned_store = get_era5_stores("yearly")
with stopwatch("Loading summer ERA5 data"):
summer_unaligned = xr.open_zarr(summer_unaligned_store, consolidated=False).set_coords("spatial_ref")
assert {"latitude", "longitude", "time"} == set(summer_unaligned.dims)
assert summer_unaligned.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {summer_unaligned.odc.crs}"
summer_unaligned = _correct_longs(summer_unaligned)
with stopwatch("Loading winter ERA5 data"):
winter_unaligned = xr.open_zarr(winter_unaligned_store, consolidated=False).set_coords("spatial_ref")
assert {"latitude", "longitude", "time"} == set(winter_unaligned.dims)
assert winter_unaligned.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {winter_unaligned.odc.crs}"
winter_unaligned = _correct_longs(winter_unaligned)
with stopwatch("Loading yearly ERA5 data"):
yearly_unaligned = xr.open_zarr(yearly_unaligned_store, consolidated=False).set_coords("spatial_ref")
assert {"latitude", "longitude", "time"} == set(yearly_unaligned.dims)
assert yearly_unaligned.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {yearly_unaligned.odc.crs}"
yearly_unaligned = _correct_longs(yearly_unaligned)
# ? Converting cell IDs from hex strings to integers for xdggs compatibility summer_data = {
cells = [int(cid, 16) for cid in grid_df.cell_id.to_list()] var: np.full((len(grid_gdf), len(summer_unaligned.time)), np.nan, dtype=np.float32)
for var in summer_unaligned.data_vars
agg_aligned = (
xr.zeros_like(agg_raw.isel(latitude=0, longitude=0).drop_vars(["latitude", "longitude"]))
.expand_dims({"cell_ids": cells})
.chunk({"cell_ids": min(len(grid_df), 10000), "time": len(agg_raw.time)})
)
agg_aligned.cell_ids.attrs = {
"grid_name": "h3" if grid == "hex" else grid,
"level": level,
} }
if grid == "healpix": winter_data = {
agg_aligned.cell_ids.attrs["indexing_scheme"] = "nested" var: np.full((len(grid_gdf), len(winter_unaligned.time)), np.nan, dtype=np.float32)
for var in winter_unaligned.data_vars
from stopuhr import stopwatch }
yearly_data = {
for _, row in track( var: np.full((len(grid_gdf), len(yearly_unaligned.time)), np.nan, dtype=np.float32)
grid_df.to_crs("epsg:4326").iterrows(), for var in yearly_unaligned.data_vars
total=len(grid_df), }
for i, (_, row) in track(
enumerate(grid_gdf.to_crs("epsg:4326").iterrows()),
total=len(grid_gdf),
description="Spatially aggregating ERA5 data...", description="Spatially aggregating ERA5 data...",
): ):
cell_id = int(row.cell_id, 16) geoms = _get_corrected_geoms(row.geometry, summer_unaligned.odc.geobox)
with stopwatch("Extracting cell data", log=False): if len(geoms) == 0:
cell_data = extract_cell_data(agg_raw, row.geometry) print(f"Warning: No valid geometry for cell {row.cell_id}.")
if cell_data is False:
print(f"Warning: No data found for cell {cell_id}, skipping.")
continue continue
with stopwatch("Assigning cell data", log=False):
agg_aligned.loc[{"cell_ids": cell_id}] = cell_data
agg_aligned.to_zarr(agg_grid_path, mode="w", consolidated=False, encoding=create_encoding(agg_aligned)) cell_data = extract_cell_data(summer_unaligned, geoms)
for var in summer_unaligned.data_vars:
summer_data[var][i, :] = cell_data[var].values.astype()
cell_data = extract_cell_data(winter_unaligned, geoms)
for var in winter_unaligned.data_vars:
winter_data[var][i, :] = cell_data[var].values
cell_data = extract_cell_data(yearly_unaligned, geoms)
for var in yearly_unaligned.data_vars:
yearly_data[var][i, :] = cell_data[var].values
summer = _create_aligned(summer_unaligned, summer_data, grid, level)
winter = _create_aligned(winter_unaligned, winter_data, grid, level)
yearly = _create_aligned(yearly_unaligned, yearly_data, grid, level)
summer_store = get_era5_stores("summer", grid, level)
winter_store = get_era5_stores("winter", grid, level)
yearly_store = get_era5_stores("yearly", grid, level)
summer.to_zarr(summer_store, mode="w", consolidated=False, encoding=codecs.from_ds(summer))
winter.to_zarr(winter_store, mode="w", consolidated=False, encoding=codecs.from_ds(winter))
yearly.to_zarr(yearly_store, mode="w", consolidated=False, encoding=codecs.from_ds(yearly))
print("Finished spatial matching.") print("Finished spatial matching.")
stopwatch.summary() stopwatch.summary()

View file

@ -4,8 +4,6 @@ Author: Tobias Hölzer
Date: 09. June 2025 Date: 09. June 2025
""" """
import os
from pathlib import Path
from typing import Literal from typing import Literal
import cartopy.crs as ccrs import cartopy.crs as ccrs
@ -25,14 +23,45 @@ from shapely.ops import transform
from stopuhr import stopwatch from stopuhr import stopwatch
from xdggs.healpix import HealpixInfo from xdggs.healpix import HealpixInfo
from entropice.paths import get_grid_file, get_grid_viz_file, watermask_file
traceback.install() traceback.install()
pretty.install() pretty.install()
DATA_DIR = Path(os.environ.get("DATA_DIR", "../../data")) / "entropyc-rts"
GRIDS_DIR = DATA_DIR / "grids" def open(grid: Literal["hex", "healpix"], level: int):
FIGURES_DIR = DATA_DIR / "figures" """Open a saved grid from parquet file.
GRIDS_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True) Args:
grid (Literal["hex", "healpix"]): The grid type to use.
level (int): The grid level to use.
Returns:
GeoDataFrame: The loaded grid.
"""
gridfile = get_grid_file(grid, level)
grid = gpd.read_parquet(gridfile)
return grid
def get_cell_ids(grid: Literal["hex", "healpix"], level: int):
"""Get the cell IDs of a saved grid.
Args:
grid (Literal["hex", "healpix"]): The grid type to use.
level (int): The grid level to use.
Returns:
List[str]: The list of cell IDs.
"""
grid_gdf = open(grid, level)
cell_ids = grid_gdf["cell_id"].tolist()
if grid == "hex":
# ? Converting cell IDs from hex strings to integers for xdggs compatibility
cell_ids = [int(cid, 16) for cid in cell_ids]
return cell_ids
@stopwatch("Create a global hex grid") @stopwatch("Create a global hex grid")
@ -131,7 +160,7 @@ def filter_permafrost_grid(grid: gpd.GeoDataFrame):
grid = grid.to_crs("EPSG:3413") grid = grid.to_crs("EPSG:3413")
# Filter out non-land areas (e.g., oceans) # Filter out non-land areas (e.g., oceans)
water_mask = gpd.read_file(DATA_DIR / "simplified-water-polygons-split-3857/simplified_water_polygons.shp") water_mask = gpd.read_file(watermask_file)
water_mask = water_mask.to_crs("EPSG:3413") water_mask = water_mask.to_crs("EPSG:3413")
ov = gpd.overlay(grid, water_mask, how="intersection") ov = gpd.overlay(grid, water_mask, how="intersection")
@ -226,14 +255,14 @@ def cli(grid: Literal["hex", "healpix"], level: int):
print("No valid grid cells found.") print("No valid grid cells found.")
return return
grid_file = GRIDS_DIR / f"permafrost_{grid}{level}_grid.parquet" grid_file = get_grid_file(grid, level)
grid_gdf.to_parquet(grid_file) grid_gdf.to_parquet(grid_file)
print(f"Saved to {grid_file.resolve()}") print(f"Saved to {grid_file}")
fig = vizualize_grid(grid_gdf, grid, level) fig = vizualize_grid(grid_gdf, grid, level)
fig_file = FIGURES_DIR / f"permafrost_{grid}{level}_grid.png" fig_file = get_grid_viz_file(grid, level)
fig.savefig(fig_file, dpi=300) fig.savefig(fig_file, dpi=300)
print(f"Saved figure to {fig_file.resolve()}") print(f"Saved figure to {fig_file}")
plt.close(fig) plt.close(fig)

75
src/entropice/paths.py Normal file
View file

@ -0,0 +1,75 @@
# ruff: noqa: D103
"""Paths for entropice data storage."""
import os
from pathlib import Path
from typing import Literal
DATA_DIR = Path(os.environ.get("DATA_DIR", "data")).resolve() / "entropice"
GRIDS_DIR = DATA_DIR / "grids"
FIGURES_DIR = DATA_DIR / "figures"
DARTS_DIR = DATA_DIR / "darts"
ERA5_DIR = DATA_DIR / "era5"
EMBEDDINGS_DIR = DATA_DIR / "embeddings"
WATERMASK_DIR = DATA_DIR / "watermask"
GRIDS_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
DARTS_DIR.mkdir(parents=True, exist_ok=True)
ERA5_DIR.mkdir(parents=True, exist_ok=True)
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
WATERMASK_DIR.mkdir(parents=True, exist_ok=True)
watermask_file = DATA_DIR / "simplified_water_polygons.shp"
dartsl2_file = DARTS_DIR / "DARTS_NitzeEtAl_v1-2_features_2018-2023_level2.parquet"
dartsl2_cov_file = DARTS_DIR / "DARTS_NitzeEtAl_v1-2_coverage_2018-2023_level2.parquet"
def _get_gridname(grid: Literal["hex", "healpix"], level: int) -> str:
return f"permafrost_{grid}{level}"
def get_grid_file(grid: Literal["hex", "healpix"], level: int) -> Path:
gridname = _get_gridname(grid, level)
gridfile = GRIDS_DIR / f"{gridname}_grid.parquet"
return gridfile
def get_grid_viz_file(grid: Literal["hex", "healpix"], level: int) -> Path:
gridname = _get_gridname(grid, level)
vizfile = FIGURES_DIR / f"{gridname}_grid.png"
return vizfile
def get_darts_rts_file(grid: Literal["hex", "healpix"], level: int) -> Path:
gridname = _get_gridname(grid, level)
rtsfile = DARTS_DIR / f"{gridname}_rts.parquet"
return rtsfile
def get_annual_embeddings_file(grid: Literal["hex", "healpix"], level: int, year: int) -> Path:
gridname = _get_gridname(grid, level)
embfile = EMBEDDINGS_DIR / f"{gridname}_embeddings-{year}.parquet"
return embfile
def get_embeddings_store(grid: Literal["hex", "healpix"], level: int) -> Path:
gridname = _get_gridname(grid, level)
embstore = EMBEDDINGS_DIR / f"{gridname}_embeddings.zarr"
return embstore
def get_era5_stores(
agg: Literal["daily", "monthly", "summer", "winter", "yearly"],
grid: Literal["hex", "healpix"] | None = None,
level: int | None = None,
):
if grid is None or level is None:
return ERA5_DIR / f"{agg}_climate_unaligned.zarr"
gridname = _get_gridname(grid, level)
aligned_path = ERA5_DIR / f"{gridname}_{agg}_climate.zarr"
return aligned_path

View file

@ -0,0 +1,32 @@
"""Helpers for the watermask."""
import geopandas as gpd
from entropice.paths import watermask_file
def open():
"""Open the watermask shapefile.
Returns:
GeoDataFrame: The watermask polygons.
"""
watermask = gpd.read_file(watermask_file)
return watermask
def clip_grid(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
"""Clip the input GeoDataFrame with the watermask.
Args:
gdf (gpd.GeoDataFrame): The input GeoDataFrame to clip.
Returns:
gpd.GeoDataFrame: The clipped GeoDataFrame.
"""
watermask = open()
watermask = watermask.to_crs("EPSG:3413")
gdf = gdf.overlay(watermask, how="difference")
return gdf

View file

@ -1,66 +0,0 @@
"""Labels of Retrogressive-Thaw-Slumps (RTS).
Assumes that the level 1 and level 2 DARTS features have been downloaded into $DATA_DIR / entropyc-rts / darts: https://arcticdata.io/catalog/view/doi:10.18739/A22B8VD7C
Author: Tobias Hölzer
Date: October 2025
"""
import os
from pathlib import Path
from typing import Literal
import cyclopts
import geopandas as gpd
from rich.progress import track
DATA_DIR = Path(os.environ.get("DATA_DIR", "../../data")) / "entropyc-rts"
LEVEL2_PATH = DATA_DIR / "darts" / "DARTS_NitzeEtAl_v1-2_features_2018-2023_level2.parquet"
LEVEL2_COV_PATH = DATA_DIR / "darts" / "DARTS_NitzeEtAl_v1-2_coverage_2018-2023_level2.parquet"
def extract_darts_rts(grid: Literal["hex", "healpix"], level: int):
"""Extract RTS labels from DARTS dataset.
Args:
grid (Literal["hex", "healpix"]): The grid type to use.
level (int): The grid level to use.
"""
darts_l2 = gpd.read_parquet(LEVEL2_PATH)
darts_cov_l2 = gpd.read_parquet(LEVEL2_COV_PATH)
gridname = f"permafrost_{grid}{level}"
grid = gpd.read_parquet(DATA_DIR / f"grids/{gridname}_grid.parquet")
grid_l2 = grid.overlay(darts_l2.to_crs(grid.crs), how="intersection")
grid_cov_l2 = grid.overlay(darts_cov_l2.to_crs(grid.crs), how="intersection")
years = list(grid_cov_l2["year"].unique())
for year in track(years, total=len(years), description="Processing years..."):
subset = grid_l2[grid_l2["year"] == year]
subset_cov = grid_cov_l2[grid_cov_l2["year"] == year]
counts = subset.groupby("cell_id").size()
grid[f"darts_{year}_rts_count"] = grid.cell_id.map(counts)
areas = subset.groupby("cell_id").apply(lambda x: x.geometry.area.sum(), include_groups=False)
grid[f"darts_{year}_rts_area"] = grid.cell_id.map(areas)
areas_cov = subset_cov.groupby("cell_id").apply(lambda x: x.geometry.area.sum(), include_groups=False)
grid[f"darts_{year}_covered_area"] = grid.cell_id.map(areas_cov)
grid[f"darts_{year}_coverage"] = grid[f"darts_{year}_covered_area"] / grid.geometry.area
grid[f"darts_{year}_rts_density"] = grid[f"darts_{year}_rts_area"] / grid[f"darts_{year}_covered_area"]
output_path = DATA_DIR / f"darts/{gridname}_darts.parquet"
grid.to_parquet(output_path)
print(f"Saved RTS labels to {output_path}")
def main(): # noqa: D103
cyclopts.run(extract_darts_rts)
if __name__ == "__main__":
main()

File diff suppressed because one or more lines are too long

2
uv.lock generated
View file

@ -1199,7 +1199,7 @@ wheels = [
] ]
[[package]] [[package]]
name = "entropic-perma-risk" name = "entropice"
version = "0.1.0" version = "0.1.0"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [