Make aggregations work
This commit is contained in:
parent
98314fe8b3
commit
7b09dda6a3
6 changed files with 328 additions and 97 deletions
|
|
@ -38,18 +38,7 @@ All spatial aggregations relied heavily on CPU compute, since Cupy lacking suppo
|
||||||
and for higher resolution grids the number of pixels to reduce was too small to overcome the data movement overhead of using a GPU.
|
and for higher resolution grids the number of pixels to reduce was too small to overcome the data movement overhead of using a GPU.
|
||||||
|
|
||||||
The aggregations scale through the number of concurrent processes (specified by `--concurrent_partitions`), with memory usage growing linearly with the degree of parallelism.
|
The aggregations scale through the number of concurrent processes (specified by `--concurrent_partitions`), with memory usage growing linearly with the degree of parallelism.
|
||||||
|
All spatial aggregations into the different grids took around 30 min each, with a total memory peak of ~300 GB partitioned over 40 processes.
|
||||||
| grid | time | memory | processes |
|
|
||||||
| ----- | ------ | ------ | --------- |
|
|
||||||
| Hex3 | | | |
|
|
||||||
| Hex4 | | | |
|
|
||||||
| Hex5 | | | |
|
|
||||||
| Hex6 | | | |
|
|
||||||
| Hpx6 | 37 min | ~300GB | 40 |
|
|
||||||
| Hpx7 | | | |
|
|
||||||
| Hpx8 | | | |
|
|
||||||
| Hpx9 | 25m | ~300GB | 40 |
|
|
||||||
| Hpx10 | 34 min | ~300GB | 40 |
|
|
||||||
|
|
||||||
## Alpha Earth
|
## Alpha Earth
|
||||||
|
|
||||||
|
|
@ -71,4 +60,31 @@ Each scale was choosen so that each grid cell had around 10000px do estimate the
|
||||||
|
|
||||||
## Era5
|
## Era5
|
||||||
|
|
||||||
|
### Spatial aggregations into grids
|
||||||
|
|
||||||
|
All spatial aggregations relied heavily on CPU compute, since Cupy lacks support for nanquantile
|
||||||
|
and for higher resolution grids the number of pixels to reduce was too small to overcome the data movement overhead of using a GPU.
|
||||||
|
|
||||||
|
The aggregations scale through the number of concurrent processes (specified by `--concurrent_partitions`), with memory usage growing linearly with the degree of parallelism.
|
||||||
|
|
||||||
|
Since the spatial resolution of the ERA5 dataset is coarser than that of the higher-resolution grids, different aggregation methods were used for different grid levels:
|
||||||
|
|
||||||
|
- Common aggregations: mean, min, max, std, median, p01, p05, p25, p75, p95, p99 for low resolution grids
|
||||||
|
- Only mean aggregations for medium resolution grids
|
||||||
|
- Linear interpolation for high resolution grids
|
||||||
|
|
||||||
|
Geometries crossing the antimeridian are corrected.
|
||||||
|
|
||||||
|
| grid | method |
|
||||||
|
| ----- | ----------- |
|
||||||
|
| Hex3 | Common |
|
||||||
|
| Hex4 | Common |
|
||||||
|
| Hex5 | Mean |
|
||||||
|
| Hex6 | Interpolate |
|
||||||
|
| Hpx6 | Common |
|
||||||
|
| Hpx7 | Common |
|
||||||
|
| Hpx8 | Common |
|
||||||
|
| Hpx9 | Mean |
|
||||||
|
| Hpx10 | Interpolate |
|
||||||
|
|
||||||
???
|
???
|
||||||
|
|
|
||||||
27
pixi.lock
generated
27
pixi.lock
generated
|
|
@ -462,6 +462,7 @@ environments:
|
||||||
- pypi: https://files.pythonhosted.org/packages/10/a1/510b0a7fadc6f43a6ce50152e69dbd86415240835868bb0bd9b5b88b1e06/aioitertools-0.13.0-py3-none-any.whl
|
- pypi: https://files.pythonhosted.org/packages/10/a1/510b0a7fadc6f43a6ce50152e69dbd86415240835868bb0bd9b5b88b1e06/aioitertools-0.13.0-py3-none-any.whl
|
||||||
- pypi: https://files.pythonhosted.org/packages/aa/f3/0b6ced594e51cc95d8c1fc1640d3623770d01e4969d29c0bd09945fafefa/altair-5.5.0-py3-none-any.whl
|
- pypi: https://files.pythonhosted.org/packages/aa/f3/0b6ced594e51cc95d8c1fc1640d3623770d01e4969d29c0bd09945fafefa/altair-5.5.0-py3-none-any.whl
|
||||||
- pypi: https://files.pythonhosted.org/packages/c8/a7/a597ff7dd1e1603abd94991ce242f93979d5f10b0d45ed23976dfb22bf64/altair_tiles-0.4.0-py3-none-any.whl
|
- pypi: https://files.pythonhosted.org/packages/c8/a7/a597ff7dd1e1603abd94991ce242f93979d5f10b0d45ed23976dfb22bf64/altair_tiles-0.4.0-py3-none-any.whl
|
||||||
|
- pypi: https://files.pythonhosted.org/packages/69/ce/68d6e31f0a75a5cccc03535e47434c0ca4be37fe950e93117e455cbc362c/antimeridian-0.4.5-py3-none-any.whl
|
||||||
- pypi: https://files.pythonhosted.org/packages/5b/03/c17464bbf682ea87e7e3de2ddc63395e359a78ae9c01f55fc78759ecbd79/anywidget-0.9.21-py3-none-any.whl
|
- pypi: https://files.pythonhosted.org/packages/5b/03/c17464bbf682ea87e7e3de2ddc63395e359a78ae9c01f55fc78759ecbd79/anywidget-0.9.21-py3-none-any.whl
|
||||||
- pypi: https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl
|
- pypi: https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl
|
||||||
- pypi: https://files.pythonhosted.org/packages/e0/b1/0542e0cab6f49f151a2d7a42400f84f706fc0b64e85dc1f56708b2e9fd37/array_api_compat-1.12.0-py3-none-any.whl
|
- pypi: https://files.pythonhosted.org/packages/e0/b1/0542e0cab6f49f151a2d7a42400f84f706fc0b64e85dc1f56708b2e9fd37/array_api_compat-1.12.0-py3-none-any.whl
|
||||||
|
|
@ -497,6 +498,7 @@ environments:
|
||||||
- pypi: https://files.pythonhosted.org/packages/55/e2/2537ebcff11c1ee1ff17d8d0b6f4db75873e3b0fb32c2d4a2ee31ecb310a/docstring_parser-0.17.0-py3-none-any.whl
|
- pypi: https://files.pythonhosted.org/packages/55/e2/2537ebcff11c1ee1ff17d8d0b6f4db75873e3b0fb32c2d4a2ee31ecb310a/docstring_parser-0.17.0-py3-none-any.whl
|
||||||
- pypi: https://files.pythonhosted.org/packages/11/a8/c6a4b901d17399c77cd81fb001ce8961e9f5e04d3daf27e8925cb012e163/docutils-0.22.3-py3-none-any.whl
|
- pypi: https://files.pythonhosted.org/packages/11/a8/c6a4b901d17399c77cd81fb001ce8961e9f5e04d3daf27e8925cb012e163/docutils-0.22.3-py3-none-any.whl
|
||||||
- pypi: https://files.pythonhosted.org/packages/0c/d5/c5db1ea3394c6e1732fb3286b3bd878b59507a8f77d32a2cebda7d7b7cd4/donfig-0.8.1.post1-py3-none-any.whl
|
- pypi: https://files.pythonhosted.org/packages/0c/d5/c5db1ea3394c6e1732fb3286b3bd878b59507a8f77d32a2cebda7d7b7cd4/donfig-0.8.1.post1-py3-none-any.whl
|
||||||
|
- pypi: https://files.pythonhosted.org/packages/82/29/153d1b4fc14c68e6766d7712d35a7ab6272a801c52160126ac7df681f758/duckdb-1.4.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl
|
||||||
- pypi: https://files.pythonhosted.org/packages/91/bd/d501c3c3602e70d1d729f042ae0b85446a1213a630a7a4290f361b37d9a8/earthengine_api-1.7.1-py3-none-any.whl
|
- pypi: https://files.pythonhosted.org/packages/91/bd/d501c3c3602e70d1d729f042ae0b85446a1213a630a7a4290f361b37d9a8/earthengine_api-1.7.1-py3-none-any.whl
|
||||||
- pypi: https://files.pythonhosted.org/packages/a3/cf/7feb3222d770566ca9eaf0bf6922745fadd1ed7ab11832520063a515c240/ecmwf_datastores_client-0.4.1-py3-none-any.whl
|
- pypi: https://files.pythonhosted.org/packages/a3/cf/7feb3222d770566ca9eaf0bf6922745fadd1ed7ab11832520063a515c240/ecmwf_datastores_client-0.4.1-py3-none-any.whl
|
||||||
- pypi: https://files.pythonhosted.org/packages/65/54/5e3b0e41799e17e5eff1547fda4aab53878c0adb4243de6b95f8ddef899e/ee_extra-2025.7.2-py3-none-any.whl
|
- pypi: https://files.pythonhosted.org/packages/65/54/5e3b0e41799e17e5eff1547fda4aab53878c0adb4243de6b95f8ddef899e/ee_extra-2025.7.2-py3-none-any.whl
|
||||||
|
|
@ -788,6 +790,15 @@ packages:
|
||||||
- jupyter-book ; extra == 'doc'
|
- jupyter-book ; extra == 'doc'
|
||||||
- vl-convert-python ; extra == 'doc'
|
- vl-convert-python ; extra == 'doc'
|
||||||
requires_python: '>=3.9'
|
requires_python: '>=3.9'
|
||||||
|
- pypi: https://files.pythonhosted.org/packages/69/ce/68d6e31f0a75a5cccc03535e47434c0ca4be37fe950e93117e455cbc362c/antimeridian-0.4.5-py3-none-any.whl
|
||||||
|
name: antimeridian
|
||||||
|
version: 0.4.5
|
||||||
|
sha256: 8b1f82c077d2c48eae0a6606759cfec9133a0701250371cb707a56959451d9dd
|
||||||
|
requires_dist:
|
||||||
|
- numpy>=1.22.4
|
||||||
|
- shapely>=2.0
|
||||||
|
- click>=8.1.6 ; extra == 'cli'
|
||||||
|
requires_python: '>=3.10'
|
||||||
- conda: https://conda.anaconda.org/conda-forge/noarch/anyio-4.11.0-pyhcf101f3_0.conda
|
- conda: https://conda.anaconda.org/conda-forge/noarch/anyio-4.11.0-pyhcf101f3_0.conda
|
||||||
sha256: 7378b5b9d81662d73a906fabfc2fb81daddffe8dc0680ed9cda7a9562af894b0
|
sha256: 7378b5b9d81662d73a906fabfc2fb81daddffe8dc0680ed9cda7a9562af894b0
|
||||||
md5: 814472b61da9792fae28156cb9ee54f5
|
md5: 814472b61da9792fae28156cb9ee54f5
|
||||||
|
|
@ -2763,6 +2774,18 @@ packages:
|
||||||
- pytest ; extra == 'test'
|
- pytest ; extra == 'test'
|
||||||
- cloudpickle ; extra == 'test'
|
- cloudpickle ; extra == 'test'
|
||||||
requires_python: '>=3.8'
|
requires_python: '>=3.8'
|
||||||
|
- pypi: https://files.pythonhosted.org/packages/82/29/153d1b4fc14c68e6766d7712d35a7ab6272a801c52160126ac7df681f758/duckdb-1.4.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl
|
||||||
|
name: duckdb
|
||||||
|
version: 1.4.2
|
||||||
|
sha256: a456adbc3459c9dcd99052fad20bd5f8ef642be5b04d09590376b2eb3eb84f5c
|
||||||
|
requires_dist:
|
||||||
|
- ipython ; extra == 'all'
|
||||||
|
- fsspec ; extra == 'all'
|
||||||
|
- numpy ; extra == 'all'
|
||||||
|
- pandas ; extra == 'all'
|
||||||
|
- pyarrow ; extra == 'all'
|
||||||
|
- adbc-driver-manager ; extra == 'all'
|
||||||
|
requires_python: '>=3.9.0'
|
||||||
- pypi: https://files.pythonhosted.org/packages/91/bd/d501c3c3602e70d1d729f042ae0b85446a1213a630a7a4290f361b37d9a8/earthengine_api-1.7.1-py3-none-any.whl
|
- pypi: https://files.pythonhosted.org/packages/91/bd/d501c3c3602e70d1d729f042ae0b85446a1213a630a7a4290f361b37d9a8/earthengine_api-1.7.1-py3-none-any.whl
|
||||||
name: earthengine-api
|
name: earthengine-api
|
||||||
version: 1.7.1
|
version: 1.7.1
|
||||||
|
|
@ -2819,7 +2842,7 @@ packages:
|
||||||
- pypi: ./
|
- pypi: ./
|
||||||
name: entropice
|
name: entropice
|
||||||
version: 0.1.0
|
version: 0.1.0
|
||||||
sha256: d22e8659bedd1389a563f9cc66c579cad437d279597fa5b21126dda3bb856a30
|
sha256: c335ffb8f5ffc53929fcd9d656087692b6e9918938384df60d136124ca5365bc
|
||||||
requires_dist:
|
requires_dist:
|
||||||
- aiohttp>=3.12.11
|
- aiohttp>=3.12.11
|
||||||
- bokeh>=3.7.3
|
- bokeh>=3.7.3
|
||||||
|
|
@ -2875,6 +2898,8 @@ packages:
|
||||||
- cupy-xarray>=0.1.4,<0.2
|
- cupy-xarray>=0.1.4,<0.2
|
||||||
- memray>=1.19.1,<2
|
- memray>=1.19.1,<2
|
||||||
- xarray-histogram>=0.2.2,<0.3
|
- xarray-histogram>=0.2.2,<0.3
|
||||||
|
- antimeridian>=0.4.5,<0.5
|
||||||
|
- duckdb>=1.4.2,<2
|
||||||
requires_python: '>=3.13,<3.14'
|
requires_python: '>=3.13,<3.14'
|
||||||
editable: true
|
editable: true
|
||||||
- pypi: git+ssh://git@forgejo.tobiashoelzer.de:22222/tobias/entropy.git#9ca1bdf4afc4ac9b0ea29ebbc060ffecb5cffcf7
|
- pypi: git+ssh://git@forgejo.tobiashoelzer.de:22222/tobias/entropy.git#9ca1bdf4afc4ac9b0ea29ebbc060ffecb5cffcf7
|
||||||
|
|
|
||||||
|
|
@ -57,7 +57,7 @@ dependencies = [
|
||||||
"xgboost>=3.1.1,<4",
|
"xgboost>=3.1.1,<4",
|
||||||
"s3fs>=2025.10.0,<2026",
|
"s3fs>=2025.10.0,<2026",
|
||||||
"xarray-spatial",
|
"xarray-spatial",
|
||||||
"cupy-xarray>=0.1.4,<0.2", "memray>=1.19.1,<2", "xarray-histogram>=0.2.2,<0.3",
|
"cupy-xarray>=0.1.4,<0.2", "memray>=1.19.1,<2", "xarray-histogram>=0.2.2,<0.3", "antimeridian>=0.4.5,<0.5", "duckdb>=1.4.2,<2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
"""Aggregation helpers."""
|
"""Aggregation helpers."""
|
||||||
|
|
||||||
import gc
|
import gc
|
||||||
|
import multiprocessing as mp
|
||||||
import os
|
import os
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from collections.abc import Callable, Generator
|
from collections.abc import Callable, Generator
|
||||||
|
|
@ -9,7 +10,11 @@ from dataclasses import dataclass, field
|
||||||
from functools import cache
|
from functools import cache
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
|
||||||
|
import antimeridian
|
||||||
|
import cudf
|
||||||
|
import cuml.cluster
|
||||||
import geopandas as gpd
|
import geopandas as gpd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import odc.geo.geobox
|
import odc.geo.geobox
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
@ -250,7 +255,18 @@ def _process_geom(poly: Polygon, unaligned: xr.Dataset | xr.DataArray, aggregati
|
||||||
return cell_data
|
return cell_data
|
||||||
|
|
||||||
|
|
||||||
def _partition_grid(grid_gdf: gpd.GeoDataFrame, n_partitions: int) -> Generator[gpd.GeoDataFrame]:
|
def partition_grid(grid_gdf: gpd.GeoDataFrame, n_partitions: int, plot: bool = False) -> Generator[gpd.GeoDataFrame]:
|
||||||
|
"""Partition the input GeoDataFrame into n_partitions parts.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
grid_gdf (gpd.GeoDataFrame): The input GeoDataFrame to partition.
|
||||||
|
n_partitions (int): The number of partitions.
|
||||||
|
plot (bool, optional): Whether to plot the partitions. Defaults to True.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
Generator[gpd.GeoDataFrame]: Partitions of the input GeoDataFrame.
|
||||||
|
|
||||||
|
"""
|
||||||
if grid_gdf.crs.to_epsg() == 4326:
|
if grid_gdf.crs.to_epsg() == 4326:
|
||||||
crosses_antimeridian = grid_gdf.geometry.apply(_crosses_antimeridian)
|
crosses_antimeridian = grid_gdf.geometry.apply(_crosses_antimeridian)
|
||||||
else:
|
else:
|
||||||
|
|
@ -262,7 +278,24 @@ def _partition_grid(grid_gdf: gpd.GeoDataFrame, n_partitions: int) -> Generator[
|
||||||
|
|
||||||
# Simple partitioning by splitting the GeoDataFrame into n_partitions parts
|
# Simple partitioning by splitting the GeoDataFrame into n_partitions parts
|
||||||
centroids = pd.DataFrame({"x": grid_gdf.geometry.centroid.x, "y": grid_gdf.geometry.centroid.y})
|
centroids = pd.DataFrame({"x": grid_gdf.geometry.centroid.x, "y": grid_gdf.geometry.centroid.y})
|
||||||
labels = sklearn.cluster.KMeans(n_clusters=n_partitions, random_state=42).fit_predict(centroids)
|
|
||||||
|
# use cuml and cudf if len of centroids is larger than 100000
|
||||||
|
if len(centroids) > 100000:
|
||||||
|
print(f"Using cuML KMeans for partitioning {len(centroids)} centroids")
|
||||||
|
centroids_cudf = cudf.DataFrame.from_pandas(centroids)
|
||||||
|
kmeans = cuml.cluster.KMeans(n_clusters=n_partitions, random_state=42)
|
||||||
|
labels = kmeans.fit_predict(centroids_cudf).to_pandas().to_numpy()
|
||||||
|
else:
|
||||||
|
labels = sklearn.cluster.KMeans(n_clusters=n_partitions, random_state=42).fit_predict(centroids)
|
||||||
|
|
||||||
|
if plot:
|
||||||
|
grid_gdf = grid_gdf.copy()
|
||||||
|
grid_gdf["partition"] = labels
|
||||||
|
ax = grid_gdf.plot(column="partition", categorical=True, legend=True, figsize=(10, 10))
|
||||||
|
if crosses_antimeridian.any():
|
||||||
|
grid_gdf_am.plot(ax=ax, color="red", edgecolor="black", alpha=0.5)
|
||||||
|
ax.set_title("Grid partitions")
|
||||||
|
plt.show()
|
||||||
for i in range(n_partitions):
|
for i in range(n_partitions):
|
||||||
partition = grid_gdf[labels == i]
|
partition = grid_gdf[labels == i]
|
||||||
yield partition
|
yield partition
|
||||||
|
|
@ -292,19 +325,30 @@ class _MemoryProfiler:
|
||||||
|
|
||||||
|
|
||||||
memprof = None
|
memprof = None
|
||||||
|
shared_raster = None
|
||||||
|
|
||||||
|
|
||||||
def _init_worker():
|
def _init_worker(r: xr.Dataset | None):
|
||||||
global memprof
|
global memprof
|
||||||
|
global shared_raster
|
||||||
memprof = _MemoryProfiler()
|
memprof = _MemoryProfiler()
|
||||||
|
if r is not None:
|
||||||
|
# print("Initializing shared raster in worker")
|
||||||
|
shared_raster = r
|
||||||
|
|
||||||
|
|
||||||
def _align_partition(
|
def _align_partition(
|
||||||
grid_partition_gdf: gpd.GeoDataFrame,
|
grid_partition_gdf: gpd.GeoDataFrame,
|
||||||
raster: xr.Dataset | Callable[[], xr.Dataset],
|
raster: xr.Dataset | Callable[[], xr.Dataset] | None,
|
||||||
aggregations: _Aggregations,
|
aggregations: _Aggregations | None, # None -> Interpolation
|
||||||
pxbuffer: int,
|
pxbuffer: int,
|
||||||
):
|
):
|
||||||
|
# ? This function is expected to run inside a worker process
|
||||||
|
# It heavily utilizes different techniques to reduce memory usage such as
|
||||||
|
# Lazy operations, reading only necessary data, and cleaning up memory after use.
|
||||||
|
# Shared in-memory raster datasets are used when possible to avoid duplicating large datasets in memory.
|
||||||
|
# Shared raster datasets only work when using the "fork" start method for multiprocessing.
|
||||||
|
|
||||||
# Strategy for each cell:
|
# Strategy for each cell:
|
||||||
# 1. Correct the geometry to account for the antimeridian
|
# 1. Correct the geometry to account for the antimeridian
|
||||||
# 2. Crop the dataset and load the data into memory
|
# 2. Crop the dataset and load the data into memory
|
||||||
|
|
@ -328,122 +372,159 @@ def _align_partition(
|
||||||
|
|
||||||
memprof.log_memory("Before reading partial raster", log=False)
|
memprof.log_memory("Before reading partial raster", log=False)
|
||||||
|
|
||||||
if callable(raster) and not isinstance(raster, xr.Dataset):
|
need_to_close_raster = False
|
||||||
|
if raster is None:
|
||||||
|
# print("Using shared raster in worker")
|
||||||
|
raster = shared_raster
|
||||||
|
elif callable(raster) and not isinstance(raster, xr.Dataset):
|
||||||
|
# print("Loading raster in partition")
|
||||||
raster = raster()
|
raster = raster()
|
||||||
need_to_close_raster = True
|
need_to_close_raster = True
|
||||||
else:
|
# else:
|
||||||
need_to_close_raster = False
|
# print("Using provided raster in partition")
|
||||||
|
|
||||||
others_shape = tuple([raster.sizes[dim] for dim in raster.dims if dim not in ["y", "x", "latitude", "longitude"]])
|
if aggregations is None:
|
||||||
ongrid_shape = (len(grid_partition_gdf), len(raster.data_vars), len(aggregations), *others_shape)
|
cell_ids = grids.convert_cell_ids(grid_partition_gdf)
|
||||||
ongrid = np.full(ongrid_shape, np.nan, dtype=np.float32)
|
if grid_partition_gdf.crs.to_epsg() == 4326:
|
||||||
|
centroids = grid_partition_gdf.geometry.apply(antimeridian.fix_shape).apply(antimeridian.centroid)
|
||||||
partial_extent = odc.geo.BoundingBox(*grid_partition_gdf.total_bounds, crs=grid_partition_gdf.crs)
|
cx = centroids.apply(lambda p: p.x)
|
||||||
partial_extent = partial_extent.buffered(
|
cy = centroids.apply(lambda p: p.y)
|
||||||
raster.odc.geobox.resolution.x * pxbuffer,
|
else:
|
||||||
raster.odc.geobox.resolution.y * pxbuffer,
|
centroids = grid_partition_gdf.geometry.centroid
|
||||||
) # buffer by pxbuffer pixels
|
cx = centroids.x
|
||||||
with stopwatch("Cropping raster to partition extent", log=False):
|
cy = centroids.y
|
||||||
try:
|
interp_x = xr.DataArray(cx, dims=["cell_ids"], coords={"cell_ids": cell_ids})
|
||||||
partial_raster = raster.odc.crop(partial_extent, apply_mask=False).compute()
|
interp_y = xr.DataArray(cy, dims=["cell_ids"], coords={"cell_ids": cell_ids})
|
||||||
except Exception as e:
|
interp_coords = (
|
||||||
print(f"Error cropping raster to partition extent: {e}")
|
{"latitude": interp_y, "longitude": interp_x}
|
||||||
return ongrid
|
if "latitude" in raster.dims and "longitude" in raster.dims
|
||||||
|
else {"y": interp_y, "x": interp_x}
|
||||||
if partial_raster.nbytes / 1e9 > 20:
|
|
||||||
print(
|
|
||||||
f"{os.getpid()}: WARNING! Partial raster size is larger than 20GB:"
|
|
||||||
f" {partial_raster.nbytes / 1e9:.2f} GB ({len(grid_partition_gdf)} cells)."
|
|
||||||
f" This may lead to out-of-memory errors."
|
|
||||||
)
|
)
|
||||||
memprof.log_memory("After reading partial raster", log=False)
|
# ?: Cubic does not work with NaNs in xarray interp
|
||||||
|
with stopwatch("Interpolating data to grid centroids", log=False):
|
||||||
|
ongrid = raster.interp(interp_coords, method="linear", kwargs={"fill_value": np.nan})
|
||||||
|
memprof.log_memory("After interpolating data", log=False)
|
||||||
|
else:
|
||||||
|
partial_extent = odc.geo.BoundingBox(*grid_partition_gdf.total_bounds, crs=grid_partition_gdf.crs)
|
||||||
|
partial_extent = partial_extent.buffered(
|
||||||
|
raster.odc.geobox.resolution.x * pxbuffer,
|
||||||
|
raster.odc.geobox.resolution.y * pxbuffer,
|
||||||
|
) # buffer by pxbuffer pixels
|
||||||
|
with stopwatch("Cropping raster to partition extent", log=False):
|
||||||
|
try:
|
||||||
|
partial_raster: xr.Dataset = raster.odc.crop(partial_extent, apply_mask=False).compute()
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error cropping raster to partition extent: {e}")
|
||||||
|
raise e
|
||||||
|
|
||||||
for i, (idx, row) in enumerate(grid_partition_gdf.iterrows()):
|
if partial_raster.nbytes / 1e9 > 20:
|
||||||
try:
|
print(
|
||||||
cell_data = _process_geom(row.geometry, partial_raster, aggregations)
|
f"{os.getpid()}: WARNING! Partial raster size is larger than 20GB:"
|
||||||
except (SystemError, SystemExit, KeyboardInterrupt) as e:
|
f" {partial_raster.nbytes / 1e9:.2f} GB ({len(grid_partition_gdf)} cells)."
|
||||||
raise e
|
f" This may lead to out-of-memory errors."
|
||||||
except Exception as e:
|
)
|
||||||
print(f"Error processing cell {row['cell_id']}: {e}")
|
memprof.log_memory("After reading partial raster", log=False)
|
||||||
continue
|
others_shape = tuple(
|
||||||
ongrid[i, ...] = cell_data
|
[raster.sizes[dim] for dim in raster.dims if dim not in ["y", "x", "latitude", "longitude"]]
|
||||||
|
)
|
||||||
|
ongrid_shape = (len(grid_partition_gdf), len(raster.data_vars), len(aggregations), *others_shape)
|
||||||
|
ongrid = np.full(ongrid_shape, np.nan, dtype=np.float32)
|
||||||
|
|
||||||
cell_ids = grids.convert_cell_ids(grid_partition_gdf)
|
for i, (idx, row) in enumerate(grid_partition_gdf.iterrows()):
|
||||||
dims = ["cell_ids", "variables", "aggregations"]
|
try:
|
||||||
coords = {"cell_ids": cell_ids, "variables": list(raster.data_vars), "aggregations": aggregations.aggnames()}
|
cell_data = _process_geom(row.geometry, partial_raster, aggregations)
|
||||||
for dim in set(raster.dims) - {"y", "x", "latitude", "longitude"}:
|
except (SystemError, SystemExit, KeyboardInterrupt) as e:
|
||||||
dims.append(dim)
|
raise e
|
||||||
coords[dim] = raster.coords[dim]
|
except Exception as e:
|
||||||
|
print(f"Error processing cell {row['cell_id']}: {e}")
|
||||||
|
continue
|
||||||
|
ongrid[i, ...] = cell_data
|
||||||
|
|
||||||
ongrid = xr.DataArray(ongrid, dims=dims, coords=coords).to_dataset("variables")
|
cell_ids = grids.convert_cell_ids(grid_partition_gdf)
|
||||||
|
dims = ["cell_ids", "variables", "aggregations"]
|
||||||
|
coords = {"cell_ids": cell_ids, "variables": list(raster.data_vars), "aggregations": aggregations.aggnames()}
|
||||||
|
for dim in set(raster.dims) - {"y", "x", "latitude", "longitude"}:
|
||||||
|
dims.append(dim)
|
||||||
|
coords[dim] = raster.coords[dim]
|
||||||
|
|
||||||
|
ongrid = xr.DataArray(ongrid, dims=dims, coords=coords).to_dataset("variables")
|
||||||
|
|
||||||
|
partial_raster.close()
|
||||||
|
del partial_raster
|
||||||
|
|
||||||
partial_raster.close()
|
|
||||||
del partial_raster
|
|
||||||
if need_to_close_raster:
|
if need_to_close_raster:
|
||||||
raster.close()
|
raster.close()
|
||||||
del raster
|
del raster
|
||||||
gc.collect()
|
gc.collect()
|
||||||
memprof.log_memory("After cleaning", log=False)
|
memprof.log_memory("After cleaning", log=False)
|
||||||
|
|
||||||
print("Finished processing partition")
|
# print("Finished processing partition")
|
||||||
print("### Stopwatch summary ###\n")
|
# print("### Stopwatch summary ###\n")
|
||||||
print(stopwatch.summary())
|
# print(stopwatch.summary())
|
||||||
print("### Memory summary ###\n")
|
# print("### Memory summary ###\n")
|
||||||
print(memprof.summary())
|
# print(memprof.summary())
|
||||||
print("#########################")
|
# print("#########################")
|
||||||
|
|
||||||
return ongrid
|
return ongrid
|
||||||
|
|
||||||
|
|
||||||
@stopwatch("Aligning data with grid")
|
@stopwatch("Aligning data with grid")
|
||||||
def _align_data(
|
def _align_data(
|
||||||
grid_gdf: gpd.GeoDataFrame,
|
grid_gdf: gpd.GeoDataFrame | list[gpd.GeoDataFrame],
|
||||||
raster: xr.Dataset | Callable[[], xr.Dataset],
|
raster: xr.Dataset | Callable[[], xr.Dataset],
|
||||||
aggregations: _Aggregations,
|
aggregations: _Aggregations | None,
|
||||||
n_partitions: int,
|
n_partitions: int | None,
|
||||||
concurrent_partitions: int,
|
concurrent_partitions: int,
|
||||||
pxbuffer: int,
|
pxbuffer: int,
|
||||||
):
|
):
|
||||||
partial_ongrids = []
|
partial_ongrids = []
|
||||||
|
|
||||||
_init_worker()
|
if isinstance(grid_gdf, list):
|
||||||
|
n_partitions = len(grid_gdf)
|
||||||
|
grid_partitions = grid_gdf
|
||||||
|
else:
|
||||||
|
grid_partitions = partition_grid(grid_gdf, n_partitions)
|
||||||
|
|
||||||
if n_partitions < concurrent_partitions:
|
if n_partitions < concurrent_partitions:
|
||||||
print(f"Adjusting concurrent_partitions from {concurrent_partitions} to {n_partitions}")
|
print(f"Adjusting concurrent_partitions from {concurrent_partitions} to {n_partitions}")
|
||||||
concurrent_partitions = n_partitions
|
concurrent_partitions = n_partitions
|
||||||
|
|
||||||
if concurrent_partitions <= 1:
|
if concurrent_partitions <= 1:
|
||||||
for i, grid_partition in enumerate(_partition_grid(grid_gdf, n_partitions)):
|
_init_worker(None) # No need to use a shared raster, since the processing is done in the main process
|
||||||
print(f"Processing partition {i + 1}/{n_partitions} with {len(grid_partition)} cells")
|
for i, grid_partition in enumerate(grid_partitions):
|
||||||
|
# print(f"Processing partition {i + 1}/{n_partitions} with {len(grid_partition)} cells")
|
||||||
part_ongrid = _align_partition(
|
part_ongrid = _align_partition(
|
||||||
grid_partition,
|
grid_partition,
|
||||||
raster, # .copy() if isinstance(raster, xr.Dataset) else raster,
|
raster,
|
||||||
aggregations,
|
aggregations,
|
||||||
pxbuffer,
|
pxbuffer,
|
||||||
)
|
)
|
||||||
partial_ongrids.append(part_ongrid)
|
partial_ongrids.append(part_ongrid)
|
||||||
else:
|
else:
|
||||||
|
# For mp start method fork, we can share the raster dataset between workers
|
||||||
|
if mp.get_start_method(allow_none=True) == "fork":
|
||||||
|
_init_worker(raster if isinstance(raster, xr.Dataset) else None)
|
||||||
|
initargs = (None,)
|
||||||
|
else:
|
||||||
|
# For spawn or forkserver, we need to copy the raster into each worker
|
||||||
|
initargs = (raster if isinstance(raster, xr.Dataset) else None,)
|
||||||
|
|
||||||
with ProcessPoolExecutor(
|
with ProcessPoolExecutor(
|
||||||
max_workers=concurrent_partitions,
|
max_workers=concurrent_partitions,
|
||||||
initializer=_init_worker,
|
initializer=_init_worker,
|
||||||
# initializer=_init_raster_global,
|
initargs=initargs,
|
||||||
# initargs=(raster,),
|
|
||||||
) as executor:
|
) as executor:
|
||||||
futures = {}
|
futures = {}
|
||||||
for i, grid_partition in enumerate(_partition_grid(grid_gdf, n_partitions)):
|
for i, grid_partition in enumerate(grid_partitions):
|
||||||
futures[
|
futures[
|
||||||
executor.submit(
|
executor.submit(
|
||||||
_align_partition,
|
_align_partition,
|
||||||
grid_partition,
|
grid_partition,
|
||||||
raster.copy() if isinstance(raster, xr.Dataset) else raster,
|
None if isinstance(raster, xr.Dataset) else raster,
|
||||||
aggregations,
|
aggregations,
|
||||||
pxbuffer,
|
pxbuffer,
|
||||||
)
|
)
|
||||||
] = i
|
] = i
|
||||||
if i == 6:
|
|
||||||
print("Breaking after 3 partitions for testing purposes")
|
|
||||||
|
|
||||||
print("Submitted all partitions, waiting for results...")
|
|
||||||
|
|
||||||
for future in track(
|
for future in track(
|
||||||
as_completed(futures),
|
as_completed(futures),
|
||||||
|
|
@ -452,7 +533,7 @@ def _align_data(
|
||||||
):
|
):
|
||||||
try:
|
try:
|
||||||
i = futures[future]
|
i = futures[future]
|
||||||
print(f"Processed partition {i + 1}/{len(futures)}")
|
# print(f"Processed partition {i + 1}/{len(futures)}")
|
||||||
part_ongrid = future.result()
|
part_ongrid = future.result()
|
||||||
partial_ongrids.append(part_ongrid)
|
partial_ongrids.append(part_ongrid)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -465,11 +546,11 @@ def _align_data(
|
||||||
|
|
||||||
def aggregate_raster_into_grid(
|
def aggregate_raster_into_grid(
|
||||||
raster: xr.Dataset | Callable[[], xr.Dataset],
|
raster: xr.Dataset | Callable[[], xr.Dataset],
|
||||||
grid_gdf: gpd.GeoDataFrame,
|
grid_gdf: gpd.GeoDataFrame | list[gpd.GeoDataFrame],
|
||||||
aggregations: _Aggregations,
|
aggregations: _Aggregations | Literal["interpolate"],
|
||||||
grid: Literal["hex", "healpix"],
|
grid: Literal["hex", "healpix"],
|
||||||
level: int,
|
level: int,
|
||||||
n_partitions: int = 20,
|
n_partitions: int | None = 20,
|
||||||
concurrent_partitions: int = 5,
|
concurrent_partitions: int = 5,
|
||||||
pxbuffer: int = 15,
|
pxbuffer: int = 15,
|
||||||
):
|
):
|
||||||
|
|
@ -477,11 +558,13 @@ def aggregate_raster_into_grid(
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
raster (xr.Dataset | Callable[[], xr.Dataset]): Raster data or a function that returns it.
|
raster (xr.Dataset | Callable[[], xr.Dataset]): Raster data or a function that returns it.
|
||||||
grid_gdf (gpd.GeoDataFrame): The grid to aggregate into.
|
grid_gdf (gpd.GeoDataFrame | list[gpd.GeoDataFrame]): The grid to aggregate into.
|
||||||
aggregations (_Aggregations): The aggregations to perform.
|
If a list of GeoDataFrames is provided, each will be processed as a separate partition.
|
||||||
|
No further partitioning will be done and the n_partitions argument will be ignored.
|
||||||
|
aggregations (_Aggregations | Literal["interpolate"]): The aggregations to perform.
|
||||||
grid (Literal["hex", "healpix"]): The type of grid to use.
|
grid (Literal["hex", "healpix"]): The type of grid to use.
|
||||||
level (int): The level of the grid.
|
level (int): The level of the grid.
|
||||||
n_partitions (int, optional): Number of partitions to divide the grid into. Defaults to 20.
|
n_partitions (int | None, optional): Number of partitions to divide the grid into. Defaults to 20.
|
||||||
concurrent_partitions (int, optional): Maximum number of worker processes when processing partitions.
|
concurrent_partitions (int, optional): Maximum number of worker processes when processing partitions.
|
||||||
Defaults to 5.
|
Defaults to 5.
|
||||||
pxbuffer (int, optional): Pixel buffer around each grid cell. Defaults to 15.
|
pxbuffer (int, optional): Pixel buffer around each grid cell. Defaults to 15.
|
||||||
|
|
@ -493,7 +576,7 @@ def aggregate_raster_into_grid(
|
||||||
ongrid = _align_data(
|
ongrid = _align_data(
|
||||||
grid_gdf,
|
grid_gdf,
|
||||||
raster,
|
raster,
|
||||||
aggregations,
|
aggregations if aggregations != "interpolate" else None,
|
||||||
n_partitions=n_partitions,
|
n_partitions=n_partitions,
|
||||||
concurrent_partitions=concurrent_partitions,
|
concurrent_partitions=concurrent_partitions,
|
||||||
pxbuffer=pxbuffer,
|
pxbuffer=pxbuffer,
|
||||||
|
|
|
||||||
|
|
@ -87,10 +87,12 @@ import numpy as np
|
||||||
import odc.geo
|
import odc.geo
|
||||||
import odc.geo.xr
|
import odc.geo.xr
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import shapely.geometry
|
||||||
import ultraplot as uplt
|
import ultraplot as uplt
|
||||||
import xarray as xr
|
import xarray as xr
|
||||||
import xdggs
|
import xdggs
|
||||||
import xvec
|
import xvec
|
||||||
|
from rasterio.features import shapes
|
||||||
from rich import pretty, print, traceback
|
from rich import pretty, print, traceback
|
||||||
from stopuhr import stopwatch
|
from stopuhr import stopwatch
|
||||||
|
|
||||||
|
|
@ -643,6 +645,7 @@ def viz(
|
||||||
# ===========================
|
# ===========================
|
||||||
|
|
||||||
|
|
||||||
|
@stopwatch("Correcting longitudes to -180 to 180")
|
||||||
def _correct_longs(ds: xr.Dataset) -> xr.Dataset:
|
def _correct_longs(ds: xr.Dataset) -> xr.Dataset:
|
||||||
return ds.assign_coords(longitude=(((ds.longitude + 180) % 360) - 180)).sortby("longitude")
|
return ds.assign_coords(longitude=(((ds.longitude + 180) % 360) - 180)).sortby("longitude")
|
||||||
|
|
||||||
|
|
@ -651,6 +654,7 @@ def _correct_longs(ds: xr.Dataset) -> xr.Dataset:
|
||||||
def spatial_agg(
|
def spatial_agg(
|
||||||
grid: Literal["hex", "healpix"],
|
grid: Literal["hex", "healpix"],
|
||||||
level: int,
|
level: int,
|
||||||
|
concurrent_partitions: int = 20,
|
||||||
):
|
):
|
||||||
"""Perform spatial aggregation of ERA5 data to grid cells.
|
"""Perform spatial aggregation of ERA5 data to grid cells.
|
||||||
|
|
||||||
|
|
@ -661,6 +665,8 @@ def spatial_agg(
|
||||||
Args:
|
Args:
|
||||||
grid ("hex" | "healpix"): Grid type.
|
grid ("hex" | "healpix"): Grid type.
|
||||||
level (int): Grid resolution level.
|
level (int): Grid resolution level.
|
||||||
|
concurrent_partitions (int, optional): Number of concurrent partitions to process.
|
||||||
|
Defaults to 20.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
with stopwatch(f"Loading {grid} grid at level {level}"):
|
with stopwatch(f"Loading {grid} grid at level {level}"):
|
||||||
|
|
@ -669,6 +675,23 @@ def spatial_agg(
|
||||||
grid_gdf = watermask.clip_grid(grid_gdf)
|
grid_gdf = watermask.clip_grid(grid_gdf)
|
||||||
grid_gdf = grid_gdf.to_crs("epsg:4326")
|
grid_gdf = grid_gdf.to_crs("epsg:4326")
|
||||||
|
|
||||||
|
aggregations = {
|
||||||
|
"hex": {
|
||||||
|
3: _Aggregations.common(),
|
||||||
|
4: _Aggregations.common(),
|
||||||
|
5: _Aggregations(mean=True),
|
||||||
|
6: "interpolate",
|
||||||
|
},
|
||||||
|
"healpix": {
|
||||||
|
6: _Aggregations.common(),
|
||||||
|
7: _Aggregations.common(),
|
||||||
|
8: _Aggregations.common(),
|
||||||
|
9: _Aggregations(mean=True),
|
||||||
|
10: "interpolate",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
aggregations = aggregations[grid][level]
|
||||||
|
|
||||||
for agg in ["yearly", "seasonal", "shoulder"]:
|
for agg in ["yearly", "seasonal", "shoulder"]:
|
||||||
unaligned_store = get_era5_stores(agg)
|
unaligned_store = get_era5_stores(agg)
|
||||||
with stopwatch(f"Loading {agg} ERA5 data"):
|
with stopwatch(f"Loading {agg} ERA5 data"):
|
||||||
|
|
@ -677,8 +700,26 @@ def spatial_agg(
|
||||||
assert unaligned.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {unaligned.odc.crs}"
|
assert unaligned.odc.crs == "epsg:4326", f"Expected CRS 'epsg:4326', got {unaligned.odc.crs}"
|
||||||
unaligned = _correct_longs(unaligned)
|
unaligned = _correct_longs(unaligned)
|
||||||
|
|
||||||
aggregations = _Aggregations.common()
|
# Filter out Grid Cells that are completely outside the ERA5 valid area
|
||||||
aggregated = aggregate_raster_into_grid(unaligned, grid_gdf, aggregations, grid, level)
|
valid_geoms = []
|
||||||
|
for g, v in shapes(
|
||||||
|
unaligned.t2m_mean.isel(time=0).isnull().astype("uint8").values,
|
||||||
|
transform=unaligned.odc.transform,
|
||||||
|
):
|
||||||
|
if v == 0:
|
||||||
|
valid_geoms.append(shapely.geometry.shape(g))
|
||||||
|
grid_gdf_filtered = grid_gdf[grid_gdf.geometry.intersects(shapely.geometry.MultiPolygon(valid_geoms))]
|
||||||
|
|
||||||
|
aggregated = aggregate_raster_into_grid(
|
||||||
|
unaligned,
|
||||||
|
grid_gdf_filtered,
|
||||||
|
aggregations,
|
||||||
|
grid,
|
||||||
|
level,
|
||||||
|
n_partitions=40,
|
||||||
|
concurrent_partitions=concurrent_partitions,
|
||||||
|
pxbuffer=10,
|
||||||
|
)
|
||||||
|
|
||||||
aggregated = aggregated.chunk({"cell_ids": min(len(aggregated.cell_ids), 10000), "time": len(aggregated.time)})
|
aggregated = aggregated.chunk({"cell_ids": min(len(aggregated.cell_ids), 10000), "time": len(aggregated.time)})
|
||||||
store = get_era5_stores(agg, grid, level)
|
store = get_era5_stores(agg, grid, level)
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
"""Helpers for the watermask."""
|
"""Helpers for the watermask."""
|
||||||
|
|
||||||
|
import duckdb
|
||||||
import geopandas as gpd
|
import geopandas as gpd
|
||||||
|
|
||||||
from entropice.paths import watermask_file
|
from entropice.paths import watermask_file
|
||||||
|
|
@ -16,7 +17,7 @@ def open():
|
||||||
return watermask
|
return watermask
|
||||||
|
|
||||||
|
|
||||||
def clip_grid(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
|
def clip_grid(grid_gdf: gpd.GeoDataFrame, allow_duckdb: bool = False) -> gpd.GeoDataFrame:
|
||||||
"""Clip the input GeoDataFrame with the watermask.
|
"""Clip the input GeoDataFrame with the watermask.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|
@ -27,6 +28,71 @@ def clip_grid(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
watermask = open()
|
watermask = open()
|
||||||
watermask = watermask.to_crs(gdf.crs)
|
watermask = watermask.to_crs(grid_gdf.crs)
|
||||||
gdf = gdf.overlay(watermask, how="difference")
|
# ! Currently disabled - kernel crashes
|
||||||
return gdf
|
allow_duckdb = False
|
||||||
|
if len(grid_gdf) >= 10000 and allow_duckdb:
|
||||||
|
# Use duckdb for large datasets
|
||||||
|
crs = grid_gdf.crs
|
||||||
|
|
||||||
|
# Convert geometry columns to WKB format for DuckDB
|
||||||
|
# No need to copy the watermask
|
||||||
|
watermask["geometry"] = watermask.geometry.to_wkb()
|
||||||
|
grid_gdf = grid_gdf.copy()
|
||||||
|
grid_gdf["geometry"] = grid_gdf.geometry.to_wkb()
|
||||||
|
|
||||||
|
# Connect to DuckDB
|
||||||
|
con = duckdb.connect(":memory:")
|
||||||
|
|
||||||
|
# Install and load spatial extension
|
||||||
|
con.execute("INSTALL spatial;")
|
||||||
|
con.execute("LOAD spatial;")
|
||||||
|
|
||||||
|
# Register the DataFrames as tables in DuckDB
|
||||||
|
con.register("watermask", watermask)
|
||||||
|
con.register("grid", grid_gdf)
|
||||||
|
|
||||||
|
query = """
|
||||||
|
SELECT g.*
|
||||||
|
FROM grid g
|
||||||
|
WHERE NOT EXISTS (
|
||||||
|
SELECT 1
|
||||||
|
FROM watermask w
|
||||||
|
WHERE ST_Intersects(ST_GeomFromWKB(g.geometry), ST_GeomFromWKB(w.geometry))
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
|
||||||
|
query = """
|
||||||
|
WITH clipped AS (
|
||||||
|
SELECT
|
||||||
|
g.* EXCLUDE (geometry),
|
||||||
|
CASE
|
||||||
|
WHEN EXISTS (
|
||||||
|
SELECT 1
|
||||||
|
FROM watermask w
|
||||||
|
WHERE ST_Intersects(ST_GeomFromWKB(g.geometry), ST_GeomFromWKB(w.geometry))
|
||||||
|
)
|
||||||
|
THEN ST_Difference(
|
||||||
|
ST_GeomFromWKB(g.geometry),
|
||||||
|
(
|
||||||
|
SELECT ST_Union_Agg(ST_GeomFromWKB(w.geometry))
|
||||||
|
FROM watermask w
|
||||||
|
WHERE ST_Intersects(ST_GeomFromWKB(g.geometry), ST_GeomFromWKB(w.geometry))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
ELSE ST_GeomFromWKB(g.geometry)
|
||||||
|
END AS geometry
|
||||||
|
FROM grid g
|
||||||
|
)
|
||||||
|
SELECT * FROM clipped
|
||||||
|
WHERE NOT ST_IsEmpty(geometry)
|
||||||
|
"""
|
||||||
|
|
||||||
|
result = con.execute(query).df()
|
||||||
|
# Convert back to GeoDataFrame
|
||||||
|
result["geometry"] = gpd.GeoSeries.from_wkb(result["geometry"].apply(bytes))
|
||||||
|
grid_gdf = gpd.GeoDataFrame(result, geometry="geometry", crs=crs)
|
||||||
|
con.close()
|
||||||
|
else:
|
||||||
|
grid_gdf = grid_gdf.overlay(watermask, how="difference")
|
||||||
|
return grid_gdf
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue