# entropice/steps/s1_1_era5/era5.py
"""Download and preprocess ERA5 data.
Variables of Interest:
- 2 metre temperature (t2m) [instant]
- Total precipitation (tp) [accum]
- Snow Fall (sf) [accum]
- Snow cover (snowc) [instant]
- Snow depth (sde) [instant]
- Surface sensible heat flux (sshf) [accum]
- Lake ice bottom temperature (lblt) [instant]
Naming patterns:
- Instant variables are downloaded already statistically aggregated (lossy),
therefore their names carry the aggregation as a suffix
- Accumulation variables are downloaded as totals (sum), their names stay the same
Daily Variables (downloaded from hourly data):
- t2m_max
- t2m_min
- t2m_mean
- snowc_mean
- sde_mean
- lblt_max
- tp
- sf
- sshf
Derived Daily Variables:
- t2m_range [instant]: t2m_max - t2m_min
- t2m_avg [instant]: (t2m_max + t2m_min) / 2
- t2m_skew [instant]: (t2m_mean - t2m_min) / t2m_range
- thawing_degree_days [accum]: (t2m_avg - 273.15).clip(min=0)
- freezing_degree_days [accum]: (273.15 - t2m_avg).clip(min=0)
- thawing_days [accum]: (t2m_avg > 273.15).astype(int)
- freezing_days [accum]: (t2m_avg < 273.15).astype(int)
- precipitation_occurrences [accum]: (tp > 0).astype(int)
- snowfall_occurrences [accum]: (sf > 0).astype(int)
- naive_snow_isolation [instant]: snowc_mean * sde_mean
Monthly, Winter, Summer & Yearly Aggregations (Names don't change):
- instant variables:
- *_min -> min
- *_max -> max
- all others -> median
- accum variables: sum
Derived & (from monthly) Aggregated Winter Variables:
- effective_snow_depth [instant]: (sde_mean * (n + 1 - m)).sum() / (n * (n + 1) / 2),
see also https://tc.copernicus.org/articles/11/989/2017/tc-11-989-2017.pdf
Derived & (from daily) Aggregated Yearly Variables:
- day_of_first_thaw [yearly]: First day in year where t2m_daily_avg > 273.15
- day_of_last_thaw [yearly]: Last day in year where t2m_daily_avg > 273.15
- thawing_period_length [yearly]: day_of_last_thaw - day_of_first_thaw
- day_of_first_freeze [yearly]: First day in year where t2m_daily_avg < 273.15
- day_of_last_freeze [yearly]: Last day in year where t2m_daily_avg < 273.15
About yearly aggregates:
- A year always starts on 1st October and ends on 30th September of the next year
to better capture the Arctic seasonal cycle.
- Thus year == 2020 means 1st Oct 2019 - 30th Sep 2020
- Thus winter == 2020 means 1st Oct 2019 - 31st March 2020
- Thus summer == 2020 means 1st April 2020 - 30th Sep 2020
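CLI usage (assuming cyclopts' default kebab-case command naming, e.g.):
- python era5.py download
- python era5.py enrich --n-workers 10
- python era5.py spatial-agg hex 5 --agg yearly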
Author: Tobias Hölzer
Date: June to October 2025
"""
import os
import time
from pathlib import Path
from typing import Literal
import cyclopts
import dask.distributed as dd
import geopandas as gpd
import numpy as np
import odc.geo
import odc.geo.xr
import pandas as pd
import shapely
import shapely.ops
import xarray as xr
from rich import pretty, print, traceback
from rich.progress import track
from shapely.geometry import LineString, Polygon
from zarr.codecs import BloscCodec
traceback.install(show_locals=True, suppress=[cyclopts, xr, pd])
pretty.install()
cli = cyclopts.App()
DATA_DIR = Path(os.environ.get("DATA_DIR", "data")) / "entropyc-rts"
ERA5_DIR = DATA_DIR / "era5"
DAILY_RAW_PATH = ERA5_DIR / "daily_raw.zarr"
DAILY_ENRICHED_PATH = ERA5_DIR / "daily_enriched.zarr"
MONTHLY_RAW_PATH = ERA5_DIR / "monthly_raw.zarr"
YEARLY_RAW_PATH = ERA5_DIR / "yearly_aligned.zarr"
SUMMER_RAW_PATH = ERA5_DIR / "summer_aligned.zarr"
WINTER_RAW_PATH = ERA5_DIR / "winter_aligned.zarr"
2025-10-21 18:42:01 +02:00
def _get_grid_paths(
agg: Literal["daily", "monthly", "summer", "winter", "yearly"],
grid: Literal["hex", "healpix"],
level: int,
):
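"""Return the path of the grid-aligned zarr store for the given temporal aggregation, grid type and level."""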
gridname = f"permafrost_{grid}{level}"
aligned_path = ERA5_DIR / f"{agg}_{gridname}.zarr"
return aligned_path
min_lat = 50
max_lat = 83.7  # Keeps latitude chunks of size 64 complete (90 - 64 / 10 + 0.1)
min_time = "1990-01-01"
max_time = "2024-12-31"
today = time.strftime("%Y-%m-%d")
instants = {
"t2m_max",
"t2m_min",
"t2m_mean",
"snowc_mean",
"sde_mean",
"lblt_max",
"t2m_range",
"t2m_avg",
"t2m_skew",
"naive_snow_isolation",
}
accums = {
"tp",
"sf",
"sshf",
"thawing_degree_days",
"freezing_degree_days",
"thawing_days",
"freezing_days",
"precipitation_occurrences",
"snowfall_occurrences",
}
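# These sets steer the temporal aggregations below: instant variables are
# reduced with min/max/median (chosen by their name suffix), accumulative
# variables are summed.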
def create_encoding(ds: xr.Dataset):
"""Create compression encoding for zarr dataset storage.
Creates Blosc compression configuration for all data variables and coordinates
in the dataset using zstd compression with level 5.
Args:
ds (xr.Dataset): The xarray Dataset to create encoding for.
Returns:
dict: Encoding dictionary with compression settings for each variable.
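Example (with a hypothetical output path):
    >>> encoding = create_encoding(ds)
    >>> ds.to_zarr("out.zarr", mode="w", encoding=encoding, consolidated=False)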
"""
# encoding = {var: {"compressors": BloscCodec(cname="zlib", clevel=9)} for var in ds.data_vars}
encoding = {var: {"compressors": BloscCodec(cname="zstd", clevel=5)} for var in [*ds.data_vars, *ds.coords]}
return encoding
# ================
# === Download ===
# ================
def download_daily_aggregated():
"""Download and aggregate ERA5 data to daily resolution.
Downloads ERA5 reanalysis data from the DESTINE Earth Data Hub and aggregates
it to daily resolution. Includes temperature extremes, precipitation, snow,
and surface heat flux variables.
The function downloads hourly data and creates daily aggregates:
- Temperature: daily min/max
- Precipitation and snowfall: daily totals
- Snow cover and depth: daily means
- Surface heat flux: daily totals
- Lake ice temperature: daily max
The aggregated data is saved to a zarr file with compression.
"""
era5 = xr.open_dataset(
"https://data.earthdatahub.destine.eu/era5/reanalysis-era5-land-no-antartica-v0.zarr",
storage_options={"client_kwargs": {"trust_env": True}},
chunks={},
engine="zarr",
).rename({"valid_time": "time"})
subset = {
"latitude": slice(max_lat, min_lat),
}
# Compute the closest chunk start at or before min_time, to avoid problems with cropped chunks at the start
tchunksize = era5.chunksizes["time"][0]
era5_chunk_starts = pd.date_range(era5.time.min().item(), era5.time.max().item(), freq=f"{tchunksize}h")
closest_chunk_start = era5_chunk_starts[
era5_chunk_starts.get_indexer([pd.to_datetime(min_time)], method="ffill")[0]
]
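# (e.g. for hypothetical 24h chunks, a min_time falling mid-chunk snaps back to the
# nearest earlier chunk boundary, so only whole chunks are read from the remote store)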
subset["time"] = slice(str(closest_chunk_start), max_time)
era5 = era5.sel(**subset)
daily_raw = xr.merge(
[
# Instant
era5.t2m.resample(time="1D").max().rename("t2m_max"),
era5.t2m.resample(time="1D").min().rename("t2m_min"),
era5.t2m.resample(time="1D").mean().rename("t2m_mean"),
era5.snowc.resample(time="1D").mean().rename("snowc_mean"),
era5.sde.resample(time="1D").mean().rename("sde_mean"),
era5.lblt.resample(time="1D").max().rename("lblt_max"),
# Accum
era5.tp.resample(time="1D").sum().rename("tp"),
era5.sf.resample(time="1D").sum().rename("sf"),
era5.sshf.resample(time="1D").sum().rename("sshf"),
]
)
# Assign attributes
daily_raw["t2m_max"].attrs = {"long_name": "Daily maximum 2 metre temperature", "units": "K"}
daily_raw["t2m_min"].attrs = {"long_name": "Daily minimum 2 metre temperature", "units": "K"}
daily_raw["t2m_mean"].attrs = {"long_name": "Daily mean 2 metre temperature", "units": "K"}
daily_raw["tp"].attrs = {"long_name": "Daily total precipitation", "units": "m"}
daily_raw["sf"].attrs = {"long_name": "Daily total snow fall", "units": "m"}
daily_raw["snowc_mean"].attrs = {"long_name": "Daily mean snow cover", "units": "m"}
daily_raw["sde_mean"].attrs = {"long_name": "Daily mean snow depth", "units": "m"}
daily_raw["sshf"].attrs = {"long_name": "Daily total surface sensible heat flux", "units": "J/m²"}
daily_raw["lblt_max"].attrs = {"long_name": "Daily maximum lake ice bottom temperature", "units": "K"}
daily_raw = daily_raw.odc.assign_crs("epsg:4326")
daily_raw = daily_raw.drop_vars(["surface", "number", "depthBelowLandLayer"])
daily_raw.to_zarr(DAILY_RAW_PATH, mode="w", encoding=create_encoding(daily_raw), consolidated=False)
@cli.command
def download():
"""Download ERA5 data using Dask cluster for parallel processing.
Creates a local Dask cluster and downloads daily aggregated ERA5 data.
The cluster is configured with a single worker with 10 threads and 100GB
memory limit for optimal performance.
"""
with (
dd.LocalCluster(n_workers=1, threads_per_worker=10, memory_limit="100GB") as cluster,
dd.Client(cluster) as client,
):
print(client)
print(client.dashboard_link)
download_daily_aggregated()
print(f"Downloaded and aggregated ERA5 data to {DAILY_RAW_PATH.resolve()}.")
# ============================
# === Temporal Aggregation ===
# ============================
def daily_enrich():
"""Enrich daily ERA5 data with derived climate variables.
Loads downloaded daily ERA5 data and computes additional climate variables.
Creates derived variables including temperature statistics, degree days, and occurrence indicators.
Derived variables include:
- Daily average and range temperature
- Temperature skewness
- Thawing and freezing degree days
- Thawing and freezing day counts
- Precipitation and snowfall occurrences
- Snow isolation index
"""
daily = xr.open_zarr(DAILY_RAW_PATH, consolidated=False).set_coords("spatial_ref")
assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}"
# Formulas based on Groeke et al. (2025) Stochastic Weather generation...
daily["t2m_avg"] = (daily.t2m_max + daily.t2m_min) / 2
daily.t2m_avg.attrs = {"long_name": "Daily average 2 metre temperature", "units": "K"}
daily["t2m_range"] = daily.t2m_max - daily.t2m_min
daily.t2m_range.attrs = {"long_name": "Daily range of 2 metre temperature", "units": "K"}
daily["t2m_skew"] = (daily.t2m_mean - daily.t2m_min) / daily.t2m_range
daily.t2m_skew.attrs = {"long_name": "Daily skewness of 2 metre temperature"}
daily["thawing_degree_days"] = (daily.t2m_avg - 273.15).clip(min=0)
daily.thawing_degree_days.attrs = {"long_name": "Thawing degree days", "units": "K"}
daily["freezing_degree_days"] = (273.15 - daily.t2m_avg).clip(min=0)
daily.freezing_degree_days.attrs = {"long_name": "Freezing degree days", "units": "K"}
daily["thawing_days"] = (daily.t2m_avg > 273.15).astype(int)
daily.thawing_days.attrs = {"long_name": "Thawing days"}
daily["freezing_days"] = (daily.t2m_avg < 273.15).astype(int)
daily.freezing_days.attrs = {"long_name": "Freezing days"}
daily["precipitation_occurrences"] = (daily.tp > 0).astype(int)
daily.precipitation_occurrences.attrs = {"long_name": "Precipitation occurrences"}
daily["snowfall_occurrences"] = (daily.sf > 0).astype(int)
daily.snowfall_occurrences.attrs = {"long_name": "Snowfall occurrences"}
daily["naive_snow_isolation"] = daily.snowc_mean * daily.sde_mean
daily.naive_snow_isolation.attrs = {"long_name": "Naive snow isolation"}
daily.to_zarr(DAILY_ENRICHED_PATH, mode="w", encoding=create_encoding(daily), consolidated=False)
def monthly_aggregate():
2025-10-21 18:42:01 +02:00
"""Aggregate enriched daily ERA5 data to monthly resolution.
Takes the enriched daily ERA5 data and creates monthly aggregates using
appropriate statistical functions for each variable type.
Instant variables use min, max, or median aggregations, while accumulative
variables are summed over the month.
The aggregated monthly data is saved to a zarr file for further processing.
"""
daily = xr.open_zarr(DAILY_ENRICHED_PATH, consolidated=False)
assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}"
daily = daily.sel(time=slice(min_time, max_time))
# Monthly instant aggregates
monthly_instants = []
for var in instants:
if var.endswith("_min"):
agg = daily[var].resample(time="1ME").min().rename(var)
agg.attrs = daily[var].attrs
agg.attrs["long_name"] = f"Monthly minimum of {daily[var].attrs.get('long_name', var)}"
monthly_instants.append(agg)
elif var.endswith("_max"):
agg = daily[var].resample(time="1ME").max().rename(var)
agg.attrs = daily[var].attrs
agg.attrs["long_name"] = f"Monthly maximum of {daily[var].attrs.get('long_name', var)}"
monthly_instants.append(agg)
else:
agg = daily[var].resample(time="1ME").median().rename(var)
agg.attrs = daily[var].attrs
agg.attrs["long_name"] = f"Monthly median of {daily[var].attrs.get('long_name', var)}"
monthly_instants.append(agg)
monthly_accums = []
for var in accums:
agg = daily[var].resample(time="1ME").sum().rename(var)
agg.attrs = daily[var].attrs
monthly_accums.append(agg)
monthly = xr.merge(monthly_instants + monthly_accums)
monthly = monthly.chunk({"time": len(monthly.time), "latitude": 64, "longitude": 64})
monthly.to_zarr(MONTHLY_RAW_PATH, mode="w", encoding=create_encoding(monthly), consolidated=False)
def multi_monthly_aggregate(monthly: xr.Dataset, n: int = 12) -> xr.Dataset:
"""Aggregate monthly ERA5 data to a multi-month resolution.
Takes monthly aggregated data and creates multi-month aggregates using a shifted
calendar (October to September) to better capture Arctic seasonal patterns.
Args:
monthly (xr.Dataset): The monthly aggregates
n (int, optional): Number of months to aggregate over. Defaults to 12.
Returns:
xr.Dataset: The aggregated dataset
"""
# Instants
multimonthly_instants = []
for var in instants:
if var.endswith("_min"):
agg = monthly[var].resample(time=f"{n}MS", label="right").min().rename(var)
agg.attrs = monthly[var].attrs
agg.attrs["long_name"] = f"{n}-Monthly minimum of {monthly[var].attrs.get('long_name', var)}"
multimonthly_instants.append(agg)
elif var.endswith("_max"):
agg = monthly[var].resample(time=f"{n}MS", label="right").max().rename(var)
agg.attrs = monthly[var].attrs
agg.attrs["long_name"] = f"{n}-Monthly maximum of {monthly[var].attrs.get('long_name', var)}"
multimonthly_instants.append(agg)
else:
agg = monthly[var].resample(time=f"{n}MS", label="right").median().rename(var)
agg.attrs = monthly[var].attrs
agg.attrs["long_name"] = f"{n}-Monthly median of {monthly[var].attrs.get('long_name', var)}"
multimonthly_instants.append(agg)
# Accums
multimonthly_accums = []
for var in accums:
agg = monthly[var].resample(time=f"{n}MS", label="right").sum().rename(var)
agg.attrs = monthly[var].attrs
multimonthly_accums.append(agg)
multimonthly = xr.merge(multimonthly_instants + multimonthly_accums)
# Effective snow depth
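# A weighted seasonal mean following Slater et al. (2017): month m of the n-month
# window gets weight (n + 1 - m), so early-season snow counts most; the weights
# sum to n * (n + 1) / 2 (= n_sum below).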
m = np.resize(np.arange(1, n + 1), len(monthly.time))
m = xr.DataArray(m, coords={"time": monthly.time}, dims=["time"])
n_sum = n * (n + 1) // 2
multimonthly["effective_snow_depth"] = (monthly.sde_mean * (n + 1 - m)).resample(time=f"{n}MS").sum().rename(
"effective_snow_depth"
) / n_sum
multimonthly["effective_snow_depth"].attrs = {
"long_name": "Effective Snow Density",
"reference": "Slater et. al. (2017)",
"link": "https://tc.copernicus.org/articles/11/989/2017/tc-11-989-2017.pdf",
}
multimonthly = multimonthly.chunk({"time": len(multimonthly.time), "latitude": 64, "longitude": 64})
return multimonthly
def yearly_and_seasonal_aggregate():
2025-10-21 18:42:01 +02:00
"""Aggregate monthly ERA5 data to yearly resolution with seasonal splits.
Takes monthly aggregated data and creates yearly aggregates using a shifted
calendar (October to September) to better capture Arctic seasonal patterns.
Creates separate aggregates for full year, winter (Oct-Mar), and summer
(Apr-Sep) periods.
The first and last incomplete years are excluded from the analysis.
Winter covers months 1-6 of the shifted calendar, summer months 7-12.
The final dataset includes yearly, winter, and summer aggregates for all
climate variables, saved to a zarr file.
"""
monthly = xr.open_zarr(MONTHLY_RAW_PATH, consolidated=False).set_coords("spatial_ref")
assert "time" in monthly.dims, f"Expected dim 'time' to be in {monthly.dims=}"
# "Shift" the calendar by slicing the first Jan-Sep and the last Oct-Dec months
first_year = monthly.time.dt.year.min().item()
last_year = monthly.time.dt.year.max().item()
monthly = monthly.sel(time=slice(f"{first_year}-10-01", f"{last_year}-09-30"))
yearly = multi_monthly_aggregate(monthly, n=12)
yearly = derive_yearly_variables(yearly)
yearly.to_zarr(YEARLY_RAW_PATH, mode="w", encoding=create_encoding(yearly), consolidated=False)
summer_winter = multi_monthly_aggregate(monthly, n=6)
# With label="right", the Oct-Mar half-year is labelled 1 April and the
# Apr-Sep half-year is labelled 1 October.
summer = summer_winter.sel(time=summer_winter.time.dt.month == 10)
summer.to_zarr(SUMMER_RAW_PATH, mode="w", encoding=create_encoding(summer), consolidated=False)
winter = summer_winter.sel(time=summer_winter.time.dt.month == 4)
winter.to_zarr(WINTER_RAW_PATH, mode="w", encoding=create_encoding(winter), consolidated=False)
def derive_yearly_variables(yearly: xr.Dataset) -> xr.Dataset:
"""Derive additional variables from daily data and add them to the yearly dataset.
Args:
yearly (xr.Dataset): The yearly aggregated dataset to enrich.
Returns:
xr.Dataset: The enriched yearly dataset with additional derived variables.
"""
assert "time" in yearly.dims, f"Expected dim 'time' to be in {yearly.dims=}"
daily = xr.open_zarr(DAILY_ENRICHED_PATH, consolidated=False).set_coords("spatial_ref")
assert "time" in daily.dims, f"Expected dim 'time' to be in {daily.dims=}"
daily = daily.sel(time=slice(min_time, max_time))
# ? Note: The functions do not really account for leap years
# n_days_in_year = daily.time.groupby("time.year").count().rename("n_days_in_year")
n_days_in_year = 365
# A mask to check which places never thaws
# Persist in memory because we need it twice and this dramatically reduces the Dask-Graph size
never_thaws = (daily.thawing_days.groupby("time.year").sum(dim="time") == 0).compute()
# ? First and last thaw day is NOT calculated within the october-september year, but within the calendar year
# This results in a much more correct representation of thawing periods in regions where the last thawing day
# is between october and december.
# This assumes that the 1st of January is almost everywhere one of the coldest days in the year
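# argmax over the 0/1 thawing mask returns the index of the first 1, i.e. the first
# thawing day (0-based); the +1 converts it to a day of year.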
first_thaw_day = daily.thawing_days.groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1
first_thaw_day = first_thaw_day.where(~never_thaws).rename("day_of_first_thaw").rename(year="time")
first_thaw_day["time"] = pd.to_datetime([f"{y}-10-01" for y in first_thaw_day.time.values]) # noqa: PD011
first_thaw_day.attrs = {"long_name": "Day of first thaw in year", "units": "day of year"}
yearly["day_of_first_thaw"] = first_thaw_day.sel(time=yearly.time)
last_thaw_day = (
n_days_in_year - daily.thawing_days[::-1].groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1
)
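# The reversed argmax is the offset of the last thawing day from the end of the year;
# subtracting it from the assumed 365-day year length maps it back to a day of year.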
last_thaw_day = last_thaw_day.where(~never_thaws).rename("day_of_last_thaw").rename(year="time")
last_thaw_day["time"] = pd.to_datetime([f"{y}-10-01" for y in last_thaw_day.time.values]) # noqa: PD011
last_thaw_day.attrs = {"long_name": "Day of last thaw in year", "units": "day of year"}
yearly["day_of_last_thaw"] = last_thaw_day.sel(time=yearly.time)
yearly["thawing_period_length"] = (yearly.day_of_last_thaw - yearly.day_of_first_thaw).rename(
"thawing_period_length"
)
yearly.thawing_period_length.attrs = {"long_name": "Thawing period length in year", "units": "days"}
# ? First and last freeze day is NOT calculated within the october-september year, but within a july-june year
# This results, similar to the thawing days, in a much more correct representation of freezing periods in regions
# where the first freezing day is between july and september.
# This assumes that the 1st of July is almost everywhere one of the warmest days in the year
daily_shifted = daily.copy()
daily_shifted["time"] = pd.to_datetime(daily_shifted.time.values) + pd.DateOffset(months=6)
# A mask to check which places never freeze
# Persist in memory because we need it twice and this dramatically reduces the Dask-Graph size
never_freezes = (daily_shifted.freezing_days.groupby("time.year").sum(dim="time") == 0).compute()
first_freezing_day = daily_shifted.freezing_days.groupby("time.year").apply(lambda x: x.argmax(dim="time")) + 1
first_freezing_day = first_freezing_day.where(~never_freezes).rename("day_of_first_freeze").rename(year="time")
first_freezing_day["time"] = pd.to_datetime([f"{y}-10-01" for y in first_freezing_day.time.values]) # noqa: PD011
first_freezing_day.attrs = {"long_name": "Day of first freeze in year", "units": "day of year"}
yearly["day_of_first_freeze"] = first_freezing_day.sel(time=yearly.time)
last_freezing_day = (
n_days_in_year
- daily_shifted.freezing_days[::-1].groupby("time.year").apply(lambda x: x.argmax(dim="time"))
+ 1
)
last_freezing_day = last_freezing_day.where(~never_freezes).rename("day_of_last_freeze").rename(year="time")
last_freezing_day["time"] = pd.to_datetime([f"{y}-10-01" for y in last_freezing_day.time.values]) # noqa: PD011
last_freezing_day.attrs = {"long_name": "Day of last freeze in year", "units": "day of year"}
yearly["day_of_last_freeze"] = last_freezing_day.sel(time=yearly.time)
yearly["freezing_period_length"] = (yearly.day_of_last_freeze - yearly.day_of_first_freeze).rename(
"freezing_period_length"
)
yearly.freezing_period_length.attrs = {"long_name": "Freezing period length in year", "units": "days"}
return yearly
@cli.command
def enrich(n_workers: int = 10, monthly: bool = True, yearly: bool = True, daily: bool = True):
"""Enrich data and perform temporal aggregation of ERA5 data using Dask cluster.
Creates a Dask cluster and runs both monthly and yearly aggregation
functions to generate temporally aggregated climate datasets. The
processing uses parallel workers for efficient computation.
Args:
n_workers (int, optional): Number of Dask workers to use. Defaults to 10.
monthly (bool, optional): Whether to perform monthly aggregation. Defaults to True.
yearly (bool, optional): Whether to perform yearly aggregation. Defaults to True.
daily (bool, optional): Whether to perform daily enrichment. Defaults to True.
"""
with (
dd.LocalCluster(n_workers=n_workers, threads_per_worker=20, memory_limit="10GB") as cluster,
dd.Client(cluster) as client,
):
print(client)
print(client.dashboard_link)
if daily:
daily_enrich()
if monthly:
monthly_aggregate()
if yearly:
yearly_and_seasonal_aggregate()
print("Enriched ERA5 data with additional features and aggregated it temporally.")
# ===========================
# === Spatial Aggregation ===
# ===========================
def _crosses_antimeridian(geom: Polygon) -> bool:
coords = shapely.get_coordinates(geom)
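# Heuristic: longitudes of mixed sign mean the cell crosses a meridian; requiring
# |lon| > 90 distinguishes the antimeridian from the Greenwich meridian.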
crosses_any_meridian = (coords[:, 0] > 0).any() and (coords[:, 0] < 0).any()
return crosses_any_meridian and abs(coords[:, 0]).max() > 90
def _split_antimeridian_cell(geom: Polygon) -> list[Polygon]:
# Assumes that the geometry crosses the antimeridian
coords = shapely.get_coordinates(geom)
for i in range(coords.shape[0]):
if coords[i, 0] < 0:
coords[i, 0] += 360
geom = Polygon(coords)
antimeridian = LineString([[180, -90], [180, 90]])
polys = shapely.ops.split(geom, antimeridian)
return list(polys.geoms)
def _check_geom(geobox: odc.geo.geobox.GeoBox, geom: odc.geo.Geometry) -> bool:
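"""Check whether the geometry overlaps the geobox by more than one pixel along both axes."""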
enclosing = geobox.enclosing(geom)
x, y = enclosing.shape
if x <= 1 or y <= 1:
return False
roi: tuple[slice, slice] = geobox.overlap_roi(enclosing)
roix, roiy = roi
return (roix.stop - roix.start) > 1 and (roiy.stop - roiy.start) > 1
def extract_cell_data(yearly: xr.Dataset, geom: Polygon):
"""Extract ERA5 data for a specific grid cell geometry.
Extracts and spatially averages ERA5 data within the bounds of a grid cell.
Handles antimeridian-crossing cells by splitting them appropriately.
Args:
yearly (xr.Dataset): Yearly aggregated ERA5 dataset.
geom (Polygon): Polygon geometry of the grid cell.
"""
# cell.geometry is a shapely Polygon
if not _crosses_antimeridian(geom):
geoms = [geom]
# Split geometry in case it crossed antimeridian
else:
geoms = _split_antimeridian_cell(geom)
cell_data = []
for geom in geoms:
geom = odc.geo.Geometry(geom, crs="epsg:4326")
if not _check_geom(yearly.odc.geobox, geom):
continue
cell_data.append(yearly.odc.crop(geom).drop_vars("spatial_ref").mean(["latitude", "longitude"]))
if len(cell_data) == 0:
return False
elif len(cell_data) == 1:
cell_data = cell_data[0]
else:
cell_data = xr.concat(cell_data, dim="part").mean("part")
cell_data = cell_data.compute()
return cell_data
@cli.command
def spatial_agg(
grid: Literal["hex", "healpix"],
level: int,
agg: Literal["summer", "winter", "yearly"] = "yearly",
n_workers: int = 10,
):
"""Perform spatial aggregation of ERA5 data to grid cells.
Loads a grid and spatially aggregates ERA5 data to each grid cell using
parallel processing. Creates an empty zarr file first, then fills it
with extracted data for each cell.
Args:
grid ("hex" | "healpix"): Grid type.
level (int): Grid resolution level.
agg ("summer" | "winter" | "yearly"): Type of aggregation to perform. Defaults to yearly.
n_workers (int, optional): Number of parallel workers to use. Defaults to 10.
"""
gridname = f"permafrost_{grid}{level}"
agg_grid_path = _get_grid_paths(agg, grid, level)
grid_df = gpd.read_parquet(DATA_DIR / f"grids/{gridname}_grid.parquet")
# Create an empty zarr array with the right dimensions
if agg == "summer":
agg_data_path = SUMMER_RAW_PATH
elif agg == "winter":
agg_data_path = WINTER_RAW_PATH
elif agg == "yearly":
agg_data_path = YEARLY_RAW_PATH
else:
raise ValueError(f"Unknown aggregation type: {agg}")
agg_raw = (
xr.open_zarr(agg_data_path, consolidated=False, decode_timedelta=False)
.set_coords("spatial_ref")
.drop_vars(["surface", "number", "depthBelowLandLayer"])
.load()
)
assert {"latitude", "longitude", "time"} == set(agg_raw.dims), (
f"Expected dims ('latitude', 'longitude', 'time'), got {agg_raw.dims}"
)
assert agg_raw.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {agg_raw.odc.crs}"
# Convert lons to -180 to 180 instead of 0 to 360
agg_raw = agg_raw.assign_coords(longitude=(((agg_raw.longitude + 180) % 360) - 180)).sortby("longitude")
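# e.g. 350.0 -> -10.0, while 170.0 stays 170.0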
# ? Converting cell IDs from hex strings to integers for xdggs compatibility
cells = [int(cid, 16) for cid in grid_df.cell_id.to_list()]
agg_aligned = (
xr.zeros_like(agg_raw.isel(latitude=0, longitude=0).drop_vars(["latitude", "longitude"]))
.expand_dims({"cell_ids": cells})
.chunk({"cell_ids": min(len(grid_df), 10000), "time": len(agg_raw.time)})
)
agg_aligned.cell_ids.attrs = {
"grid_name": "h3" if grid == "hex" else grid,
"level": level,
}
if grid == "healpix":
agg_aligned.cell_ids.attrs["indexing_scheme"] = "nested"
from stopuhr import stopwatch
for _, row in track(
grid_df.to_crs("epsg:4326").iterrows(),
total=len(grid_df),
description="Spatially aggregating ERA5 data...",
):
cell_id = int(row.cell_id, 16)
with stopwatch("Extracting cell data", log=False):
cell_data = extract_cell_data(agg_raw, row.geometry)
if cell_data is False:
print(f"Warning: No data found for cell {cell_id}, skipping.")
continue
with stopwatch("Assigning cell data", log=False):
agg_aligned.loc[{"cell_ids": cell_id}] = cell_data
agg_aligned.to_zarr(agg_grid_path, mode="w", consolidated=False, encoding=create_encoding(agg_aligned))
print("Finished spatial matching.")
stopwatch.summary()
if __name__ == "__main__":
cli()