# entropice/steps/s1_0_alphaearth/alphaearth.py

"""Extract satellite embeddings from Google Earth Engine and map them to a grid."""

import os
import warnings
from pathlib import Path
from typing import Literal

import cyclopts
import ee
import geemap
import geopandas as gpd
import numpy as np
import pandas as pd
from rich import pretty, print, traceback
from rich.progress import track

# Silence FutureWarnings whose message mentions GeoDataFrame.swapaxes so they
# do not clutter the progress output.
warnings.filterwarnings("ignore", message=".*GeoDataFrame.swapaxes.*", category=FutureWarning)

# Enable rich pretty-printing and rich tracebacks for the whole process.
pretty.install()
traceback.install()
# NOTE: Earth Engine is initialised at import time with a hard-coded project id,
# so merely importing this module already contacts/configures the EE client.
ee.Initialize(project="ee-tobias-hoelzer")

# Root of the data directory: taken from the DATA_DIR environment variable,
# falling back to "../../data" (a relative path, so it is resolved against the
# current working directory).
DATA_DIR = Path(os.environ.get("DATA_DIR", "../../data")) / "entropyc-rts"
# Output directory for the per-year (and optional per-batch) embedding files;
# created on import if it does not exist yet.
EMBEDDINGS_DIR = DATA_DIR / "embeddings"
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)


def cli(grid: Literal["hex", "healpix"], level: int, backup_intermediate: bool = False) -> None:
    """Extract satellite embeddings from Google Earth Engine and map them to a grid.

    For every year from 2017 through 2024 the grid cells are sent to Earth Engine
    in batches of at most 100 cells. Each cell is reduced to per-band summary
    statistics (median, stdDev, min, max, mean and several percentiles) of the
    annual satellite-embedding collection, and the combined result is written to
    one parquet file per year inside ``EMBEDDINGS_DIR``.

    Args:
        grid (Literal["hex", "healpix"]): The grid type to use.
        level (int): The grid level to use.
        backup_intermediate (bool, optional): Whether to backup intermediate results. Defaults to False.

    Raises:
        ValueError: If the grid file contains no cells.

    """
    gridname = f"permafrost_{grid}{level}"
    # Keep the CLI argument (`grid`) and the loaded GeoDataFrame under separate names.
    grid_gdf = gpd.read_parquet(DATA_DIR / f"grids/{gridname}_grid.parquet")

    n_cells = len(grid_gdf)
    if n_cells == 0:
        raise ValueError(f"Grid '{gridname}' contains no cells; nothing to extract.")

    # Batches hold at most `batch_size` cells. Ceiling division guarantees at
    # least one batch for small grids (floor division gave zero batches for
    # fewer than 100 cells, which made the split fail).
    batch_size = 100
    n_batches = -(-n_cells // batch_size)

    # Column names produced by the combined reducer: "<band>_<statistic>".
    aggs = ["median", "stdDev", "min", "max", "mean", "p1", "p5", "p25", "p75", "p95", "p99"]
    bands = [f"A{str(i).zfill(2)}_{agg}" for i in range(64) for agg in aggs]

    # The reducer and the base collection do not depend on the year or the batch,
    # so they are built once up front.
    reducer = (
        ee.Reducer.median()
        .combine(ee.Reducer.stdDev(), sharedInputs=True)
        .combine(ee.Reducer.minMax(), sharedInputs=True)
        .combine(ee.Reducer.mean(), sharedInputs=True)
        .combine(ee.Reducer.percentile([1, 5, 25, 75, 95, 99]), sharedInputs=True)
    )
    base_collection = ee.ImageCollection("GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL")

    for year in track(range(2017, 2025), total=8, description="Processing years..."):
        embedding_collection = base_collection.filterDate(f"{year}-01-01", f"{year}-12-31")

        def extract_embedding(feature):
            # Restrict the year's collection to the cell and flatten it to one image.
            geom = feature.geometry()
            embedding = embedding_collection.filterBounds(geom).mosaic()
            # Summarise every band over the cell geometry with all statistics at once.
            # NOTE(review): no `scale`/`crs` is passed, so Earth Engine chooses the
            # working projection itself; for a mosaic this may be a coarse default.
            # Confirm this is the intended resolution.
            stats = embedding.reduceRegion(reducer=reducer, geometry=geom)
            # Attach the statistics as properties of the cell feature.
            return feature.set(stats)

        all_results = []
        for batch_num, start in track(
            enumerate(range(0, n_cells, batch_size)),
            description="Processing batches...",
            total=n_batches,
        ):
            # Positional slicing avoids splitting the GeoDataFrame through NumPy.
            batch_grid = grid_gdf.iloc[start : start + batch_size]

            # Convert batch to an EE FeatureCollection (EE expects lon/lat coordinates).
            eegrid_batch = ee.FeatureCollection(batch_grid.to_crs("epsg:4326").__geo_interface__)

            # Run the extraction server-side and pull the result back as a DataFrame.
            df_batch = geemap.ee_to_df(eegrid_batch.map(extract_embedding))
            all_results.append(df_batch)

            # Save batch immediately to disk as backup
            if backup_intermediate:
                batch_filename = f"{gridname}_embeddings-{year}_batch{batch_num:06d}.parquet"
                batch_result = batch_grid.merge(df_batch[[*bands, "cell_id"]], on="cell_id", how="left")
                batch_result.to_parquet(EMBEDDINGS_DIR / batch_filename)

        # Combine all batch results and join them back onto the full grid.
        df = pd.concat(all_results, ignore_index=True)
        embeddings_on_grid = grid_gdf.merge(df[[*bands, "cell_id"]], on="cell_id", how="left")
        embeddings_file = EMBEDDINGS_DIR / f"{gridname}_embeddings-{year}.parquet"
        embeddings_on_grid.to_parquet(embeddings_file)
        print(f"Saved embeddings for year {year} to {embeddings_file.resolve()}.")


def main() -> None:
    """Entry point: hand the ``cli`` function to ``cyclopts.run``."""
    cyclopts.run(cli)


# Only start the command-line interface when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Mock for alphaearth download 2025-09-28 22:30:41 +02:00			`"""Extract satellite embeddings from Google Earth Engine and map them to a grid."""`

Make era5 and alphaearth downloads work 2025-10-01 14:44:24 +02:00			`import os`
Restructure to steps 2025-10-21 18:42:01 +02:00			`import warnings`
Mock for alphaearth download 2025-09-28 22:30:41 +02:00			`from pathlib import Path`
			`from typing import Literal`

			`import cyclopts`
			`import ee`
			`import geemap`
			`import geopandas as gpd`
Run alphaearth as embeddings and add era5 download via CDS 2025-09-29 18:45:57 +02:00			`import numpy as np`
			`import pandas as pd`
Restructure to steps 2025-10-21 18:42:01 +02:00			`from rich import pretty, print, traceback`
Run alphaearth as embeddings and add era5 download via CDS 2025-09-29 18:45:57 +02:00			`from rich.progress import track`
Mock for alphaearth download 2025-09-28 22:30:41 +02:00
Restructure to steps 2025-10-21 18:42:01 +02:00			`# Filter out the GeoDataFrame.swapaxes deprecation warning`
			`warnings.filterwarnings("ignore", message=".*GeoDataFrame.swapaxes.*", category=FutureWarning)`

Mock for alphaearth download 2025-09-28 22:30:41 +02:00			`pretty.install()`
			`traceback.install()`
			`ee.Initialize(project="ee-tobias-hoelzer")`

Restructure to steps 2025-10-21 18:42:01 +02:00			`DATA_DIR = Path(os.environ.get("DATA_DIR", "../../data")) / "entropyc-rts"`
Run alphaearth as embeddings and add era5 download via CDS 2025-09-29 18:45:57 +02:00			`EMBEDDINGS_DIR = DATA_DIR / "embeddings"`
			`EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)`
Mock for alphaearth download 2025-09-28 22:30:41 +02:00

Make era5 and alphaearth downloads work 2025-10-01 14:44:24 +02:00			`def cli(grid: Literal["hex", "healpix"], level: int, backup_intermediate: bool = False):`
Mock for alphaearth download 2025-09-28 22:30:41 +02:00			`"""Extract satellite embeddings from Google Earth Engine and map them to a grid.`

			`Args:`
			`grid (Literal["hex", "healpix"]): The grid type to use.`
			`level (int): The grid level to use.`
Make era5 and alphaearth downloads work 2025-10-01 14:44:24 +02:00			`backup_intermediate (bool, optional): Whether to backup intermediate results. Defaults to False.`
Mock for alphaearth download 2025-09-28 22:30:41 +02:00
			`"""`
Run alphaearth as embeddings and add era5 download via CDS 2025-09-29 18:45:57 +02:00			`gridname = f"permafrost_{grid}{level}"`
			`grid = gpd.read_parquet(DATA_DIR / f"grids/{gridname}_grid.parquet")`
Make era5 and alphaearth downloads work 2025-10-01 14:44:24 +02:00
Restructure to steps 2025-10-21 18:42:01 +02:00			`for year in track(range(2017, 2025), total=8, description="Processing years..."):`
Make era5 and alphaearth downloads work 2025-10-01 14:44:24 +02:00			`embedding_collection = ee.ImageCollection("GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL")`
			`embedding_collection = embedding_collection.filterDate(f"{year}-01-01", f"{year}-12-31")`
Restructure to steps 2025-10-21 18:42:01 +02:00			`aggs = ["median", "stdDev", "min", "max", "mean", "p1", "p5", "p25", "p75", "p95", "p99"]`
			`bands = [f"A{str(i).zfill(2)}_{agg}" for i in range(64) for agg in aggs]`
Make era5 and alphaearth downloads work 2025-10-01 14:44:24 +02:00
			`def extract_embedding(feature):`
			`# Filter collection by geometry`
			`geom = feature.geometry()`
			`embedding = embedding_collection.filterBounds(geom).mosaic()`
			`# Get mean embedding value for the geometry`
			`mean_dict = embedding.reduceRegion(`
Restructure to steps 2025-10-21 18:42:01 +02:00			`reducer=ee.Reducer.median()`
			`.combine(ee.Reducer.stdDev(), sharedInputs=True)`
			`.combine(ee.Reducer.minMax(), sharedInputs=True)`
			`.combine(ee.Reducer.mean(), sharedInputs=True)`
			`.combine(ee.Reducer.percentile([1, 5, 25, 75, 95, 99]), sharedInputs=True),`
Make era5 and alphaearth downloads work 2025-10-01 14:44:24 +02:00			`geometry=geom,`
			`)`
			`# Add mean embedding values as properties to the feature`
			`return feature.set(mean_dict)`

			`# Process grid in batches of 100`
			`batch_size = 100`
			`all_results = []`
			`n_batches = len(grid) // batch_size`
			`for batch_num, batch_grid in track(`
			`enumerate(np.array_split(grid, n_batches)),`
			`description="Processing batches...",`
			`total=n_batches,`
			`):`
			`# Convert batch to EE FeatureCollection`
			`eegrid_batch = ee.FeatureCollection(batch_grid.to_crs("epsg:4326").__geo_interface__)`

			`# Apply embedding extraction to batch`
			`eeegrid_batch = eegrid_batch.map(extract_embedding)`
			`df_batch = geemap.ee_to_df(eeegrid_batch)`

			`# Store batch results`
			`all_results.append(df_batch)`

			`# Save batch immediately to disk as backup`
			`if backup_intermediate:`
			`batch_filename = f"{gridname}_embeddings-{year}_batch{batch_num:06d}.parquet"`
			`batch_result = batch_grid.merge(df_batch[[*bands, "cell_id"]], on="cell_id", how="left")`
			`batch_result.to_parquet(EMBEDDINGS_DIR / f"{batch_filename}")`

			`# Combine all batch results`
			`df = pd.concat(all_results, ignore_index=True)`
			`embeddings_on_grid = grid.merge(df[[*bands, "cell_id"]], on="cell_id", how="left")`
Restructure to steps 2025-10-21 18:42:01 +02:00			`embeddings_file = EMBEDDINGS_DIR / f"{gridname}_embeddings-{year}.parquet"`
			`embeddings_on_grid.to_parquet(embeddings_file)`
			`print(f"Saved embeddings for year {year} to {embeddings_file.resolve()}.")`
Mock for alphaearth download 2025-09-28 22:30:41 +02:00

Restructure to steps 2025-10-21 18:42:01 +02:00			`def main(): # noqa: D103`
Mock for alphaearth download 2025-09-28 22:30:41 +02:00			`cyclopts.run(cli)`
Restructure to steps 2025-10-21 18:42:01 +02:00

			`if __name__ == "__main__":`
			`main()`