Refactor training script
This commit is contained in:
parent
ad5f810f34
commit
2cefe35690
4 changed files with 458 additions and 357 deletions
|
|
@ -149,6 +149,36 @@ class SplittedArrays:
|
||||||
train: torch.Tensor | np.ndarray | cp.ndarray
|
train: torch.Tensor | np.ndarray | cp.ndarray
|
||||||
test: torch.Tensor | np.ndarray | cp.ndarray
|
test: torch.Tensor | np.ndarray | cp.ndarray
|
||||||
|
|
||||||
|
@cached_property
|
||||||
|
def combined(self) -> torch.Tensor | np.ndarray | cp.ndarray:
|
||||||
|
"""Combined train and test arrays."""
|
||||||
|
if isinstance(self.train, torch.Tensor) and isinstance(self.test, torch.Tensor):
|
||||||
|
return torch.cat([self.train, self.test], dim=0)
|
||||||
|
elif isinstance(self.train, cp.ndarray) and isinstance(self.test, cp.ndarray):
|
||||||
|
return cp.concatenate([self.train, self.test], axis=0)
|
||||||
|
elif isinstance(self.train, np.ndarray) and isinstance(self.test, np.ndarray):
|
||||||
|
return np.concatenate([self.train, self.test], axis=0)
|
||||||
|
else:
|
||||||
|
raise TypeError("Incompatible types for train and test arrays.")
|
||||||
|
|
||||||
|
def as_numpy(self) -> "SplittedArrays":
|
||||||
|
"""Return the arrays as numpy arrays."""
|
||||||
|
train_np = (
|
||||||
|
self.train.cpu().numpy()
|
||||||
|
if isinstance(self.train, torch.Tensor)
|
||||||
|
else self.train.get()
|
||||||
|
if isinstance(self.train, cp.ndarray)
|
||||||
|
else self.train
|
||||||
|
)
|
||||||
|
test_np = (
|
||||||
|
self.test.cpu().numpy()
|
||||||
|
if isinstance(self.test, torch.Tensor)
|
||||||
|
else self.test.get()
|
||||||
|
if isinstance(self.test, cp.ndarray)
|
||||||
|
else self.test
|
||||||
|
)
|
||||||
|
return SplittedArrays(train=train_np, test=test_np)
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True, eq=False)
|
@dataclass(frozen=True, eq=False)
|
||||||
class TrainingSet:
|
class TrainingSet:
|
||||||
|
|
@ -185,12 +215,28 @@ class TrainingSet:
|
||||||
intervals.append((category_raw_values.min(), category_raw_values.max()))
|
intervals.append((category_raw_values.min(), category_raw_values.max()))
|
||||||
return intervals
|
return intervals
|
||||||
|
|
||||||
@cached_property
|
@property
|
||||||
def target_labels(self) -> list[str]:
|
def target_labels(self) -> list[str] | None:
|
||||||
"""Labels of the target categories."""
|
"""Labels of the target categories, None if not categorical."""
|
||||||
binned = self.targets["y"]
|
binned = self.targets["y"]
|
||||||
assert binned.dtype.name == "category", "Target labels are not categorical."
|
if binned.dtype.name == "category":
|
||||||
return list(binned.cat.categories)
|
return list(binned.cat.categories)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def target_codes(self) -> list[int] | None:
|
||||||
|
"""Label-Codes of the target categories, None if not categorical."""
|
||||||
|
binned = self.targets["y"]
|
||||||
|
if binned.dtype.name == "category":
|
||||||
|
return list(binned.cat.codes)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_names(self) -> list[str]:
|
||||||
|
"""Names of the features."""
|
||||||
|
return self.features.columns.tolist()
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.z)
|
return len(self.z)
|
||||||
|
|
@ -538,7 +584,7 @@ class DatasetEnsemble:
|
||||||
"none": No caching.
|
"none": No caching.
|
||||||
"read": Read from cache if exists, otherwise create and save to cache.
|
"read": Read from cache if exists, otherwise create and save to cache.
|
||||||
"overwrite": Always create and save to cache, overwriting existing cache.
|
"overwrite": Always create and save to cache, overwriting existing cache.
|
||||||
Defaults to "none".
|
Defaults to "read".
|
||||||
|
|
||||||
Yields:
|
Yields:
|
||||||
Generator[pd.DataFrame]: Generator yielding DataFrames with features for inference.
|
Generator[pd.DataFrame]: Generator yielding DataFrames with features for inference.
|
||||||
|
|
@ -572,7 +618,7 @@ class DatasetEnsemble:
|
||||||
"none": No caching.
|
"none": No caching.
|
||||||
"read": Read from cache if exists, otherwise create and save to cache.
|
"read": Read from cache if exists, otherwise create and save to cache.
|
||||||
"overwrite": Always create and save to cache, overwriting existing cache.
|
"overwrite": Always create and save to cache, overwriting existing cache.
|
||||||
Defaults to "none".
|
Defaults to "read".
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
pd.DataFrame: The training DataFrame.
|
pd.DataFrame: The training DataFrame.
|
||||||
|
|
@ -604,7 +650,7 @@ class DatasetEnsemble:
|
||||||
"none": No caching.
|
"none": No caching.
|
||||||
"read": Read from cache if exists, otherwise create and save to cache.
|
"read": Read from cache if exists, otherwise create and save to cache.
|
||||||
"overwrite": Always create and save to cache, overwriting existing cache.
|
"overwrite": Always create and save to cache, overwriting existing cache.
|
||||||
Defaults to "none".
|
Defaults to "read".
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
TrainingSet: The training set.
|
TrainingSet: The training set.
|
||||||
|
|
|
||||||
|
|
@ -1,17 +1,17 @@
|
||||||
# ruff: noqa: N806
|
# ruff: noqa: N806
|
||||||
"""Inference runs on trained models."""
|
"""Inference runs on trained models."""
|
||||||
|
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
import cupy as cp
|
import cupy as cp
|
||||||
import geopandas as gpd
|
import geopandas as gpd
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import torch
|
import torch
|
||||||
from cuml.ensemble import RandomForestClassifier
|
|
||||||
from entropy import ESPAClassifier
|
|
||||||
from rich import pretty, traceback
|
from rich import pretty, traceback
|
||||||
from sklearn import set_config
|
from sklearn import set_config
|
||||||
from xgboost.sklearn import XGBClassifier
|
|
||||||
|
|
||||||
from entropice.ml.dataset import DatasetEnsemble
|
from entropice.ml.dataset import DatasetEnsemble
|
||||||
|
from entropice.ml.models import SupportedModel
|
||||||
|
|
||||||
traceback.install()
|
traceback.install()
|
||||||
pretty.install()
|
pretty.install()
|
||||||
|
|
@ -21,41 +21,40 @@ set_config(array_api_dispatch=True)
|
||||||
|
|
||||||
def predict_proba(
|
def predict_proba(
|
||||||
e: DatasetEnsemble,
|
e: DatasetEnsemble,
|
||||||
clf: RandomForestClassifier | ESPAClassifier | XGBClassifier,
|
model: SupportedModel,
|
||||||
classes: list,
|
device: Literal["cpu", "cuda", "torch"] = "cuda",
|
||||||
) -> gpd.GeoDataFrame:
|
) -> gpd.GeoDataFrame:
|
||||||
"""Get predicted probabilities for each cell.
|
"""Get predicted probabilities for each cell.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
e (DatasetEnsemble): The dataset ensemble configuration.
|
e (DatasetEnsemble): The dataset ensemble configuration.
|
||||||
clf (BaseEstimator): The trained classifier to use for predictions.
|
model: SupportedModel: The trained model to use for predictions.
|
||||||
classes (list): List of class names.
|
device (Literal["cpu", "cuda", "torch"]): The device to use for predictions.
|
||||||
|
This must match with the state of the model!
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list: A list of predicted probabilities for each cell.
|
gpd.GeoDataFrame: A GeoDataFrame with cell_id, predicted probability, and geometry.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# Predict in batches to avoid memory issues
|
# Predict in batches to avoid memory issues
|
||||||
batch_size = 10000
|
batch_size = 50000
|
||||||
preds = []
|
preds = []
|
||||||
|
|
||||||
for batch in e.create_batches(batch_size=batch_size):
|
grid_gdf = e.read_grid()
|
||||||
cols_to_drop = ["geometry"]
|
for batch in e.create_inference_df(batch_size=batch_size):
|
||||||
if e.target == "darts_mllabels":
|
|
||||||
cols_to_drop += [col for col in batch.columns if col.startswith("dartsml_")]
|
|
||||||
else:
|
|
||||||
cols_to_drop += [col for col in batch.columns if col.startswith("darts_")]
|
|
||||||
X_batch = batch.drop(columns=cols_to_drop).dropna()
|
|
||||||
|
|
||||||
# Skip empty batches (all rows had NaN values)
|
# Skip empty batches (all rows had NaN values)
|
||||||
if len(X_batch) == 0:
|
if len(batch) == 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
cell_ids = X_batch.index.to_numpy()
|
cell_ids = batch.index.to_numpy()
|
||||||
cell_geoms = batch.loc[X_batch.index, "geometry"].to_numpy()
|
cell_geoms = grid_gdf.loc[batch.index, "geometry"].to_numpy()
|
||||||
X_batch = X_batch.to_numpy(dtype="float64")
|
|
||||||
X_batch = torch.asarray(X_batch, device=0)
|
X_batch = batch.to_numpy(dtype="float64")
|
||||||
batch_preds = clf.predict(X_batch)
|
if device == "torch":
|
||||||
|
X_batch = torch.from_numpy(X_batch).to("cuda")
|
||||||
|
elif device == "cuda":
|
||||||
|
X_batch = cp.asarray(X_batch)
|
||||||
|
batch_preds = model.predict(X_batch)
|
||||||
if isinstance(batch_preds, cp.ndarray):
|
if isinstance(batch_preds, cp.ndarray):
|
||||||
batch_preds = batch_preds.get()
|
batch_preds = batch_preds.get()
|
||||||
elif torch.is_tensor(batch_preds):
|
elif torch.is_tensor(batch_preds):
|
||||||
|
|
@ -63,9 +62,9 @@ def predict_proba(
|
||||||
batch_preds = gpd.GeoDataFrame(
|
batch_preds = gpd.GeoDataFrame(
|
||||||
{
|
{
|
||||||
"cell_id": cell_ids,
|
"cell_id": cell_ids,
|
||||||
"predicted_class": [classes[i] for i in batch_preds],
|
"predicted": batch_preds,
|
||||||
"geometry": cell_geoms,
|
"geometry": cell_geoms,
|
||||||
},
|
},
|
||||||
).set_crs(epsg=3413, inplace=False)
|
).set_crs(grid_gdf.crs, inplace=False)
|
||||||
preds.append(batch_preds)
|
preds.append(batch_preds)
|
||||||
return gpd.GeoDataFrame(pd.concat(preds, ignore_index=True))
|
return gpd.GeoDataFrame(pd.concat(preds, ignore_index=True))
|
||||||
|
|
|
||||||
299
src/entropice/ml/models.py
Normal file
299
src/entropice/ml/models.py
Normal file
|
|
@ -0,0 +1,299 @@
|
||||||
|
"""Training helper to create and configure ML models with hyperparameter search."""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import TypedDict
|
||||||
|
|
||||||
|
import scipy.stats
|
||||||
|
import xarray as xr
|
||||||
|
from cuml.ensemble import RandomForestClassifier, RandomForestRegressor
|
||||||
|
from cuml.neighbors import KNeighborsClassifier, KNeighborsRegressor
|
||||||
|
from entropy import ESPAClassifier
|
||||||
|
from scipy.stats._distn_infrastructure import rv_continuous_frozen, rv_discrete_frozen
|
||||||
|
from xgboost.sklearn import XGBClassifier, XGBRegressor
|
||||||
|
|
||||||
|
from entropice.ml.dataset import TrainingSet
|
||||||
|
from entropice.utils.types import Task
|
||||||
|
|
||||||
|
|
||||||
|
class Distribution(TypedDict):
|
||||||
|
"""Distribution specification for hyperparameter optimization."""
|
||||||
|
|
||||||
|
distribution: str
|
||||||
|
low: float | int
|
||||||
|
high: float | int
|
||||||
|
|
||||||
|
|
||||||
|
type HPConfig = dict[str, Distribution | list]
|
||||||
|
type SupportedModel = (
|
||||||
|
ESPAClassifier
|
||||||
|
| XGBClassifier
|
||||||
|
| XGBRegressor
|
||||||
|
| RandomForestClassifier
|
||||||
|
| RandomForestRegressor
|
||||||
|
| KNeighborsClassifier
|
||||||
|
| KNeighborsRegressor
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class ModelHPOConfig:
|
||||||
|
"""Model - Hyperparameter Optimization Configuration."""
|
||||||
|
|
||||||
|
model: SupportedModel
|
||||||
|
hp_config: HPConfig
|
||||||
|
fit_params: dict[str, object] = field(default_factory=dict)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def search_space(self) -> dict[str, list | rv_continuous_frozen | rv_discrete_frozen]:
|
||||||
|
"""Convert the HPConfig into a search space dictionary usable by sklearn's RandomizedSearchCV."""
|
||||||
|
search_space = {}
|
||||||
|
for key, dist in self.hp_config.items():
|
||||||
|
if isinstance(dist, list):
|
||||||
|
search_space[key] = dist
|
||||||
|
continue
|
||||||
|
assert hasattr(scipy.stats, dist["distribution"]), (
|
||||||
|
f"Unknown distribution type for {key}: {dist['distribution']}"
|
||||||
|
)
|
||||||
|
distfn = getattr(scipy.stats, dist["distribution"])
|
||||||
|
search_space[key] = distfn(dist["low"], dist["high"])
|
||||||
|
return search_space
|
||||||
|
|
||||||
|
|
||||||
|
# Hardcode Search Settings for now
|
||||||
|
espa_hpconfig: HPConfig = {
|
||||||
|
"eps_cl": {"distribution": "loguniform", "low": 1e-11, "high": 1e-6},
|
||||||
|
"eps_e": {"distribution": "loguniform", "low": 1e4, "high": 1e8},
|
||||||
|
"initial_K": {"distribution": "randint", "low": 400, "high": 800},
|
||||||
|
}
|
||||||
|
xgboost_hpconfig: HPConfig = {
|
||||||
|
"learning_rate": {"distribution": "loguniform", "low": 1e-3, "high": 1e-1},
|
||||||
|
"n_estimators": {"distribution": "randint", "low": 50, "high": 2000},
|
||||||
|
}
|
||||||
|
rf_hpconfig: HPConfig = {
|
||||||
|
"max_depth": {"distribution": "randint", "low": 5, "high": 50},
|
||||||
|
"n_estimators": {"distribution": "randint", "low": 50, "high": 1000},
|
||||||
|
}
|
||||||
|
knn_hpconfig: HPConfig = {
|
||||||
|
"n_neighbors": {"distribution": "randint", "low": 10, "high": 200},
|
||||||
|
"weights": ["uniform", "distance"], # Categorical parameter
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_model_hpo_config(model: str, task: Task, **model_kwargs) -> ModelHPOConfig:
|
||||||
|
"""Create a model and its hyperparameter optimization configuration based on the specified model type and task.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (str): The type of model to create. Supported values are "espa", "xgboost", "rf", and "knn".
|
||||||
|
task (Task): The type of task to perform. Supported values are "classifier" and "regressor".
|
||||||
|
Kwargs:
|
||||||
|
model_kwargs: Additional keyword arguments to pass to the model constructor.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ModelHPOConfig: A dataclass containing the model instance, hyperparameter configuration, and any fit parameters.
|
||||||
|
|
||||||
|
"""
|
||||||
|
model_type = "classifier" if task in ("binary", "count_regimes", "density_regimes") else "regressor"
|
||||||
|
match (model, model_type):
|
||||||
|
case ("espa", "classifier"):
|
||||||
|
clf = ESPAClassifier(20, 0.1, 0.1, robust=True, **model_kwargs)
|
||||||
|
fit_params = {"max_iter": 300}
|
||||||
|
return ModelHPOConfig(clf, espa_hpconfig, fit_params)
|
||||||
|
case ("xgboost", "classifier"):
|
||||||
|
clf = XGBClassifier(
|
||||||
|
objective="multi:softprob" if task != "binary" else "binary:logistic",
|
||||||
|
eval_metric="mlogloss" if task != "binary" else "logloss",
|
||||||
|
tree_method="hist",
|
||||||
|
max_depth=10,
|
||||||
|
**model_kwargs,
|
||||||
|
)
|
||||||
|
return ModelHPOConfig(clf, xgboost_hpconfig)
|
||||||
|
case ("xgboost", "regressor"):
|
||||||
|
reg = XGBRegressor(
|
||||||
|
objective="reg:squarederror",
|
||||||
|
eval_metric="rmse",
|
||||||
|
tree_method="hist",
|
||||||
|
max_depth=10,
|
||||||
|
**model_kwargs,
|
||||||
|
)
|
||||||
|
return ModelHPOConfig(reg, xgboost_hpconfig)
|
||||||
|
case ("rf", "classifier"):
|
||||||
|
clf = RandomForestClassifier(split_criterion="entropy", **model_kwargs)
|
||||||
|
return ModelHPOConfig(clf, rf_hpconfig)
|
||||||
|
case ("rf", "regressor"):
|
||||||
|
reg = RandomForestRegressor(split_criterion="variance", **model_kwargs)
|
||||||
|
return ModelHPOConfig(reg, rf_hpconfig)
|
||||||
|
case ("knn", "classifier"):
|
||||||
|
clf = KNeighborsClassifier(**model_kwargs)
|
||||||
|
return ModelHPOConfig(clf, knn_hpconfig)
|
||||||
|
case ("knn", "regressor"):
|
||||||
|
reg = KNeighborsRegressor(**model_kwargs)
|
||||||
|
return ModelHPOConfig(reg, knn_hpconfig)
|
||||||
|
case _:
|
||||||
|
raise ValueError(f"Unsupported model/task combination: {model}/{task}")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_espa_feature_importance(model: ESPAClassifier, training_data: TrainingSet) -> xr.Dataset:
|
||||||
|
"""Extract the inner state of a trained ESPAClassifier as an xarray Dataset."""
|
||||||
|
# Annotate the state with xarray metadata
|
||||||
|
boxes = list(range(model.K_))
|
||||||
|
box_centers = xr.DataArray(
|
||||||
|
model.S_.cpu().numpy(),
|
||||||
|
dims=["feature", "box"],
|
||||||
|
coords={"feature": training_data.feature_names, "box": boxes},
|
||||||
|
name="box_centers",
|
||||||
|
attrs={"description": "Centers of the boxes in feature space."},
|
||||||
|
)
|
||||||
|
box_assignments = xr.DataArray(
|
||||||
|
model.Lambda_.cpu().numpy(),
|
||||||
|
dims=["class", "box"],
|
||||||
|
coords={"class": training_data.target_labels, "box": boxes},
|
||||||
|
name="box_assignments",
|
||||||
|
attrs={"description": "Assignments of samples to boxes."},
|
||||||
|
)
|
||||||
|
feature_weights = xr.DataArray(
|
||||||
|
model.W_.cpu().numpy(),
|
||||||
|
dims=["feature"],
|
||||||
|
coords={"feature": training_data.feature_names},
|
||||||
|
name="feature_weights",
|
||||||
|
attrs={"description": "Feature weights for each box."},
|
||||||
|
)
|
||||||
|
state = xr.Dataset(
|
||||||
|
{
|
||||||
|
"box_centers": box_centers,
|
||||||
|
"box_assignments": box_assignments,
|
||||||
|
"feature_weights": feature_weights,
|
||||||
|
},
|
||||||
|
attrs={
|
||||||
|
"description": "Inner state of the best ESPAClassifier from RandomizedSearchCV.",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
return state
|
||||||
|
|
||||||
|
|
||||||
|
def extract_xgboost_feature_importance(model: XGBClassifier | XGBRegressor, training_data: TrainingSet) -> xr.Dataset:
|
||||||
|
"""Extract feature importance from a trained XGBoost model as an xarray Dataset."""
|
||||||
|
# Extract XGBoost-specific information
|
||||||
|
# Get the underlying booster
|
||||||
|
booster = model.get_booster()
|
||||||
|
|
||||||
|
# Feature importance with different importance types
|
||||||
|
# Note: get_score() returns dict with keys like 'f0', 'f1', etc. (feature indices)
|
||||||
|
importance_weight = booster.get_score(importance_type="weight")
|
||||||
|
importance_gain = booster.get_score(importance_type="gain")
|
||||||
|
importance_cover = booster.get_score(importance_type="cover")
|
||||||
|
importance_total_gain = booster.get_score(importance_type="total_gain")
|
||||||
|
importance_total_cover = booster.get_score(importance_type="total_cover")
|
||||||
|
|
||||||
|
# Create aligned arrays for all features (including zero-importance)
|
||||||
|
def align_importance(importance_dict, features):
|
||||||
|
"""Align importance dict to feature list, filling missing with 0.
|
||||||
|
|
||||||
|
XGBoost returns feature indices (f0, f1, ...) as keys, so we need to map them.
|
||||||
|
"""
|
||||||
|
return [importance_dict.get(f"f{i}", 0.0) for i in range(len(features))]
|
||||||
|
|
||||||
|
feature_importance_weight = xr.DataArray(
|
||||||
|
align_importance(importance_weight, training_data.feature_names),
|
||||||
|
dims=["feature"],
|
||||||
|
coords={"feature": training_data.feature_names},
|
||||||
|
name="feature_importance_weight",
|
||||||
|
attrs={"description": "Number of times a feature is used to split the data across all trees."},
|
||||||
|
)
|
||||||
|
feature_importance_gain = xr.DataArray(
|
||||||
|
align_importance(importance_gain, training_data.feature_names),
|
||||||
|
dims=["feature"],
|
||||||
|
coords={"feature": training_data.feature_names},
|
||||||
|
name="feature_importance_gain",
|
||||||
|
attrs={"description": "Average gain across all splits the feature is used in."},
|
||||||
|
)
|
||||||
|
feature_importance_cover = xr.DataArray(
|
||||||
|
align_importance(importance_cover, training_data.feature_names),
|
||||||
|
dims=["feature"],
|
||||||
|
coords={"feature": training_data.feature_names},
|
||||||
|
name="feature_importance_cover",
|
||||||
|
attrs={"description": "Average coverage across all splits the feature is used in."},
|
||||||
|
)
|
||||||
|
feature_importance_total_gain = xr.DataArray(
|
||||||
|
align_importance(importance_total_gain, training_data.feature_names),
|
||||||
|
dims=["feature"],
|
||||||
|
coords={"feature": training_data.feature_names},
|
||||||
|
name="feature_importance_total_gain",
|
||||||
|
attrs={"description": "Total gain across all splits the feature is used in."},
|
||||||
|
)
|
||||||
|
feature_importance_total_cover = xr.DataArray(
|
||||||
|
align_importance(importance_total_cover, training_data.feature_names),
|
||||||
|
dims=["feature"],
|
||||||
|
coords={"feature": training_data.feature_names},
|
||||||
|
name="feature_importance_total_cover",
|
||||||
|
attrs={"description": "Total coverage across all splits the feature is used in."},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Store tree information
|
||||||
|
n_trees = booster.num_boosted_rounds()
|
||||||
|
|
||||||
|
state = xr.Dataset(
|
||||||
|
{
|
||||||
|
"feature_importance_weight": feature_importance_weight,
|
||||||
|
"feature_importance_gain": feature_importance_gain,
|
||||||
|
"feature_importance_cover": feature_importance_cover,
|
||||||
|
"feature_importance_total_gain": feature_importance_total_gain,
|
||||||
|
"feature_importance_total_cover": feature_importance_total_cover,
|
||||||
|
},
|
||||||
|
attrs={
|
||||||
|
"description": "Inner state of the best XGBClassifier from RandomizedSearchCV.",
|
||||||
|
"n_trees": n_trees,
|
||||||
|
"objective": str(model.objective),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
return state
|
||||||
|
|
||||||
|
|
||||||
|
def extract_rf_feature_importance(
|
||||||
|
model: RandomForestClassifier | RandomForestRegressor, training_data: TrainingSet
|
||||||
|
) -> xr.Dataset:
|
||||||
|
"""Extract feature importance from a trained RandomForest model as an xarray Dataset."""
|
||||||
|
# Extract Random Forest-specific information
|
||||||
|
# Note: cuML's RandomForestClassifier doesn't expose individual trees (estimators_)
|
||||||
|
# like sklearn does, so we can only extract feature importances and model parameters
|
||||||
|
|
||||||
|
# Feature importances (Gini importance)
|
||||||
|
feature_importances = model.feature_importances_
|
||||||
|
|
||||||
|
feature_importance = xr.DataArray(
|
||||||
|
feature_importances,
|
||||||
|
dims=["feature"],
|
||||||
|
coords={"feature": training_data.feature_names},
|
||||||
|
name="feature_importance",
|
||||||
|
attrs={"description": "Gini importance (impurity-based feature importance)."},
|
||||||
|
)
|
||||||
|
|
||||||
|
# cuML RF doesn't expose individual trees, so we store model parameters instead
|
||||||
|
n_estimators = model.n_estimators
|
||||||
|
max_depth = model.max_depth
|
||||||
|
|
||||||
|
# OOB score if available
|
||||||
|
oob_score = None
|
||||||
|
if hasattr(model, "oob_score_") and model.oob_score:
|
||||||
|
oob_score = float(model.oob_score_)
|
||||||
|
|
||||||
|
# cuML RandomForest doesn't provide per-tree statistics like sklearn
|
||||||
|
# Store what we have: feature importances and model configuration
|
||||||
|
attrs = {
|
||||||
|
"description": "Inner state of the best RandomForestClassifier from RandomizedSearchCV (cuML).",
|
||||||
|
"n_estimators": int(n_estimators),
|
||||||
|
"note": "cuML RandomForest does not expose individual tree statistics like sklearn",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Only add optional attributes if they have values
|
||||||
|
if max_depth is not None:
|
||||||
|
attrs["max_depth"] = int(max_depth)
|
||||||
|
if oob_score is not None:
|
||||||
|
attrs["oob_score"] = oob_score
|
||||||
|
|
||||||
|
state = xr.Dataset(
|
||||||
|
{
|
||||||
|
"feature_importance": feature_importance,
|
||||||
|
},
|
||||||
|
attrs=attrs,
|
||||||
|
)
|
||||||
|
return state
|
||||||
|
|
@ -4,42 +4,31 @@ import pickle
|
||||||
from dataclasses import asdict, dataclass
|
from dataclasses import asdict, dataclass
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
import cupy as cp
|
|
||||||
import cyclopts
|
import cyclopts
|
||||||
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import toml
|
import toml
|
||||||
import torch
|
|
||||||
import xarray as xr
|
import xarray as xr
|
||||||
from cuml.ensemble import RandomForestClassifier
|
|
||||||
from cuml.neighbors import KNeighborsClassifier
|
|
||||||
from entropy import ESPAClassifier
|
|
||||||
from rich import pretty, traceback
|
from rich import pretty, traceback
|
||||||
from scipy.stats import loguniform, randint
|
|
||||||
from scipy.stats._distn_infrastructure import rv_continuous_frozen, rv_discrete_frozen
|
|
||||||
from sklearn import set_config
|
from sklearn import set_config
|
||||||
from sklearn.metrics import (
|
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, jaccard_score, precision_score, recall_score
|
||||||
accuracy_score,
|
|
||||||
confusion_matrix,
|
|
||||||
f1_score,
|
|
||||||
jaccard_score,
|
|
||||||
precision_score,
|
|
||||||
recall_score,
|
|
||||||
)
|
|
||||||
from sklearn.model_selection import KFold, RandomizedSearchCV
|
from sklearn.model_selection import KFold, RandomizedSearchCV
|
||||||
from stopuhr import stopwatch
|
from stopuhr import stopwatch
|
||||||
from xgboost.sklearn import XGBClassifier
|
|
||||||
|
|
||||||
from entropice.ml.dataset import DatasetEnsemble
|
from entropice.ml.dataset import DatasetEnsemble, SplittedArrays
|
||||||
from entropice.ml.inference import predict_proba
|
from entropice.ml.inference import predict_proba
|
||||||
|
from entropice.ml.models import (
|
||||||
|
extract_espa_feature_importance,
|
||||||
|
extract_rf_feature_importance,
|
||||||
|
extract_xgboost_feature_importance,
|
||||||
|
get_model_hpo_config,
|
||||||
|
)
|
||||||
from entropice.utils.paths import get_cv_results_dir
|
from entropice.utils.paths import get_cv_results_dir
|
||||||
from entropice.utils.types import Model, Task
|
from entropice.utils.types import Model, Task
|
||||||
|
|
||||||
traceback.install()
|
traceback.install()
|
||||||
pretty.install()
|
pretty.install()
|
||||||
|
|
||||||
# Disabled array_api_dispatch to avoid namespace conflicts between NumPy and CuPy
|
|
||||||
# when using XGBoost with device="cuda"
|
|
||||||
set_config(array_api_dispatch=True)
|
|
||||||
|
|
||||||
cli = cyclopts.App("entropice-training", config=cyclopts.config.Toml("training-config.toml")) # ty:ignore[invalid-argument-type]
|
cli = cyclopts.App("entropice-training", config=cyclopts.config.Toml("training-config.toml")) # ty:ignore[invalid-argument-type]
|
||||||
|
|
||||||
|
|
@ -82,93 +71,21 @@ _metric_functions = {
|
||||||
@cyclopts.Parameter("*")
|
@cyclopts.Parameter("*")
|
||||||
@dataclass(frozen=True, kw_only=True)
|
@dataclass(frozen=True, kw_only=True)
|
||||||
class CVSettings:
|
class CVSettings:
|
||||||
|
"""Cross-validation settings for model training."""
|
||||||
|
|
||||||
n_iter: int = 2000
|
n_iter: int = 2000
|
||||||
robust: bool = False
|
|
||||||
task: Task = "binary"
|
task: Task = "binary"
|
||||||
model: Model = "espa"
|
model: Model = "espa"
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True, kw_only=True)
|
@dataclass(frozen=True, kw_only=True)
|
||||||
class TrainingSettings(DatasetEnsemble, CVSettings):
|
class TrainingSettings(DatasetEnsemble, CVSettings):
|
||||||
|
"""Helper Wrapper to store combined training and dataset ensemble settings."""
|
||||||
|
|
||||||
param_grid: dict
|
param_grid: dict
|
||||||
cv_splits: int
|
cv_splits: int
|
||||||
metrics: list[str]
|
metrics: list[str]
|
||||||
classes: list[str]
|
classes: list[str] | None
|
||||||
|
|
||||||
|
|
||||||
def _create_clf(
|
|
||||||
settings: CVSettings,
|
|
||||||
):
|
|
||||||
if settings.model == "espa":
|
|
||||||
if settings.task == "binary":
|
|
||||||
param_grid = {
|
|
||||||
"eps_cl": loguniform(1e-4, 1e1),
|
|
||||||
"eps_e": loguniform(1e1, 1e7),
|
|
||||||
"initial_K": randint(20, 400),
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
param_grid = {
|
|
||||||
"eps_cl": loguniform(1e-11, 1e-6),
|
|
||||||
"eps_e": loguniform(1e4, 1e8),
|
|
||||||
"initial_K": randint(400, 800),
|
|
||||||
}
|
|
||||||
clf = ESPAClassifier(20, 0.1, 0.1, random_state=42, robust=settings.robust)
|
|
||||||
fit_params = {"max_iter": 300}
|
|
||||||
elif settings.model == "xgboost":
|
|
||||||
param_grid = {
|
|
||||||
"learning_rate": loguniform(1e-4, 1e-1),
|
|
||||||
"max_depth": randint(5, 50),
|
|
||||||
"n_estimators": randint(50, 1000),
|
|
||||||
}
|
|
||||||
clf = XGBClassifier(
|
|
||||||
objective="multi:softprob" if settings.task != "binary" else "binary:logistic",
|
|
||||||
eval_metric="mlogloss" if settings.task != "binary" else "logloss",
|
|
||||||
random_state=42,
|
|
||||||
tree_method="hist",
|
|
||||||
device="gpu", # Using CPU to avoid CuPy/NumPy namespace conflicts in sklearn metrics
|
|
||||||
)
|
|
||||||
fit_params = {}
|
|
||||||
elif settings.model == "rf":
|
|
||||||
param_grid = {
|
|
||||||
"max_depth": randint(5, 50),
|
|
||||||
"n_estimators": randint(50, 1000),
|
|
||||||
}
|
|
||||||
clf = RandomForestClassifier(random_state=42, split_criterion="entropy")
|
|
||||||
fit_params = {}
|
|
||||||
elif settings.model == "knn":
|
|
||||||
param_grid = {
|
|
||||||
"n_neighbors": randint(10, 200),
|
|
||||||
"weights": ["uniform", "distance"],
|
|
||||||
}
|
|
||||||
clf = KNeighborsClassifier()
|
|
||||||
fit_params = {}
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unknown model: {settings.model}")
|
|
||||||
|
|
||||||
return clf, param_grid, fit_params
|
|
||||||
|
|
||||||
|
|
||||||
def _serialize_param_grid(param_grid):
|
|
||||||
param_grid_serializable = {}
|
|
||||||
for key, dist in param_grid.items():
|
|
||||||
# ! Hacky, but I can't find a better way to serialize scipy distributions once they are created
|
|
||||||
if isinstance(dist, rv_continuous_frozen):
|
|
||||||
param_grid_serializable[key] = {
|
|
||||||
"distribution": "loguniform",
|
|
||||||
"low": dist.a,
|
|
||||||
"high": dist.b,
|
|
||||||
}
|
|
||||||
elif isinstance(dist, rv_discrete_frozen):
|
|
||||||
param_grid_serializable[key] = {
|
|
||||||
"distribution": "randint",
|
|
||||||
"low": dist.a,
|
|
||||||
"high": dist.b,
|
|
||||||
}
|
|
||||||
elif isinstance(dist, list):
|
|
||||||
param_grid_serializable[key] = dist
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unknown distribution type for {key}: {type(dist)}")
|
|
||||||
return param_grid_serializable
|
|
||||||
|
|
||||||
|
|
||||||
@cli.default
|
@cli.default
|
||||||
|
|
@ -179,24 +96,24 @@ def random_cv(
|
||||||
"""Perform random cross-validation on the training dataset.
|
"""Perform random cross-validation on the training dataset.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
grid (Grid): The grid type to use.
|
dataset_ensemble (DatasetEnsemble): The dataset ensemble configuration.
|
||||||
level (int): The grid level to use.
|
settings (CVSettings): The cross-validation settings.
|
||||||
n_iter (int, optional): Number of parameter settings that are sampled. Defaults to 2000.
|
|
||||||
robust (bool, optional): Whether to use robust training. Defaults to False.
|
|
||||||
task (Literal["binary", "count", "density"], optional): The classification task type. Defaults to "binary".
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
device = "torch" if settings.model in ["espa"] else "cuda"
|
# Since we use cuml and xgboost libraries, we can only enable array API for ESPA
|
||||||
print("Creating training data...")
|
use_array_api = settings.model == "espa"
|
||||||
training_data = dataset_ensemble.create_cat_training_dataset(task=settings.task, device=device)
|
device = "torch" if use_array_api else "cuda"
|
||||||
|
set_config(array_api_dispatch=use_array_api)
|
||||||
|
|
||||||
clf, param_grid, fit_params = _create_clf(settings)
|
print("Creating training data...")
|
||||||
print(f"Using model: {settings.model} with parameters: {param_grid}")
|
training_data = dataset_ensemble.create_training_set(task=settings.task, device=device)
|
||||||
|
model_hpo_config = get_model_hpo_config(settings.model, settings.task)
|
||||||
|
print(f"Using model: {settings.model} with parameters: {model_hpo_config.hp_config}")
|
||||||
cv = KFold(n_splits=5, shuffle=True, random_state=42)
|
cv = KFold(n_splits=5, shuffle=True, random_state=42)
|
||||||
metrics = _metrics["binary" if settings.task == "binary" else "multiclass"]
|
metrics = _metrics["binary" if settings.task == "binary" else "multiclass"]
|
||||||
search = RandomizedSearchCV(
|
search = RandomizedSearchCV(
|
||||||
clf,
|
model_hpo_config.model,
|
||||||
param_grid,
|
model_hpo_config.search_space,
|
||||||
n_iter=settings.n_iter,
|
n_iter=settings.n_iter,
|
||||||
n_jobs=1,
|
n_jobs=1,
|
||||||
cv=cv,
|
cv=cv,
|
||||||
|
|
@ -208,47 +125,27 @@ def random_cv(
|
||||||
|
|
||||||
print(f"Starting RandomizedSearchCV with {search.n_iter} candidates...")
|
print(f"Starting RandomizedSearchCV with {search.n_iter} candidates...")
|
||||||
with stopwatch(f"RandomizedSearchCV fitting for {search.n_iter} candidates"):
|
with stopwatch(f"RandomizedSearchCV fitting for {search.n_iter} candidates"):
|
||||||
y_train = (
|
search.fit(
|
||||||
training_data.y.train.get()
|
training_data.X.train,
|
||||||
if settings.model == "xgboost" and isinstance(training_data.y.train, cp.ndarray)
|
# XGBoost returns it's labels as numpy arrays instead of cupy arrays
|
||||||
else training_data.y.train
|
# Thus, for the scoring to work, we need to convert them back to numpy
|
||||||
|
training_data.y.as_numpy().train if settings.model == "xgboost" else training_data.y.train,
|
||||||
|
**model_hpo_config.fit_params,
|
||||||
)
|
)
|
||||||
search.fit(training_data.X.train, y_train, **fit_params)
|
|
||||||
|
|
||||||
print("Best parameters combination found:")
|
print("Best parameters combination found:")
|
||||||
best_estimator = search.best_estimator_
|
best_estimator = search.best_estimator_
|
||||||
best_parameters = best_estimator.get_params()
|
best_parameters = best_estimator.get_params()
|
||||||
for param_name in sorted(param_grid.keys()):
|
for param_name in sorted(model_hpo_config.hp_config.keys()):
|
||||||
print(f"{param_name}: {best_parameters[param_name]}")
|
print(f"{param_name}: {best_parameters[param_name]}")
|
||||||
|
|
||||||
y_test = (
|
test_accuracy = search.score(
|
||||||
training_data.y.test.get()
|
training_data.X.test,
|
||||||
if settings.model == "xgboost" and isinstance(training_data.y.test, cp.ndarray)
|
training_data.y.as_numpy().test if settings.model == "xgboost" else training_data.y.test,
|
||||||
else training_data.y.test
|
|
||||||
)
|
)
|
||||||
test_accuracy = search.score(training_data.X.test, y_test)
|
|
||||||
print(f"Accuracy of the best parameters using the inner CV of the random search: {search.best_score_:.3f}")
|
print(f"Accuracy of the best parameters using the inner CV of the random search: {search.best_score_:.3f}")
|
||||||
print(f"Accuracy on test set: {test_accuracy:.3f}")
|
print(f"Accuracy on test set: {test_accuracy:.3f}")
|
||||||
|
|
||||||
# Compute predictions on the test set
|
|
||||||
y_pred = best_estimator.predict(training_data.X.test)
|
|
||||||
labels = list(range(len(training_data.y.labels)))
|
|
||||||
y_test = torch.asarray(y_test, device="cuda")
|
|
||||||
y_pred = torch.asarray(y_pred, device="cuda")
|
|
||||||
labels = torch.asarray(labels, device="cuda")
|
|
||||||
|
|
||||||
test_metrics = {metric: _metric_functions[metric](y_test, y_pred) for metric in metrics}
|
|
||||||
|
|
||||||
# Get a confusion matrix
|
|
||||||
cm = confusion_matrix(y_test, y_pred, labels=labels)
|
|
||||||
label_names = [training_data.y.labels[i] for i in range(len(training_data.y.labels))]
|
|
||||||
cm = xr.DataArray(
|
|
||||||
cm.cpu().numpy(),
|
|
||||||
dims=["true_label", "predicted_label"],
|
|
||||||
coords={"true_label": label_names, "predicted_label": label_names},
|
|
||||||
name="confusion_matrix",
|
|
||||||
)
|
|
||||||
|
|
||||||
results_dir = get_cv_results_dir(
|
results_dir = get_cv_results_dir(
|
||||||
"random_search",
|
"random_search",
|
||||||
grid=dataset_ensemble.grid,
|
grid=dataset_ensemble.grid,
|
||||||
|
|
@ -257,15 +154,13 @@ def random_cv(
|
||||||
)
|
)
|
||||||
|
|
||||||
# Store the search settings
|
# Store the search settings
|
||||||
# First convert the param_grid distributions to a serializable format
|
|
||||||
param_grid_serializable = _serialize_param_grid(param_grid)
|
|
||||||
combined_settings = TrainingSettings(
|
combined_settings = TrainingSettings(
|
||||||
**asdict(settings),
|
**asdict(settings),
|
||||||
**asdict(dataset_ensemble),
|
**asdict(dataset_ensemble),
|
||||||
param_grid=param_grid_serializable,
|
param_grid=model_hpo_config.hp_config,
|
||||||
cv_splits=cv.get_n_splits(),
|
cv_splits=cv.get_n_splits(),
|
||||||
metrics=metrics,
|
metrics=metrics,
|
||||||
classes=training_data.y.labels,
|
classes=training_data.target_labels,
|
||||||
)
|
)
|
||||||
settings_file = results_dir / "search_settings.toml"
|
settings_file = results_dir / "search_settings.toml"
|
||||||
print(f"Storing search settings to {settings_file}")
|
print(f"Storing search settings to {settings_file}")
|
||||||
|
|
@ -288,209 +183,71 @@ def random_cv(
|
||||||
print(f"Storing CV results to {results_file}")
|
print(f"Storing CV results to {results_file}")
|
||||||
results.to_parquet(results_file)
|
results.to_parquet(results_file)
|
||||||
|
|
||||||
# Store the test metrics
|
# Compute predictions on the all sets and move them to numpy for metric computations
|
||||||
test_metrics_file = results_dir / "test_metrics.toml"
|
y_pred = SplittedArrays(
|
||||||
|
train=best_estimator.predict(training_data.X.train),
|
||||||
|
test=best_estimator.predict(training_data.X.test),
|
||||||
|
).as_numpy()
|
||||||
|
|
||||||
|
# Compute and StoreMetrics
|
||||||
|
y = training_data.y.as_numpy()
|
||||||
|
test_metrics = {metric: _metric_functions[metric](y.test, y_pred.test) for metric in metrics}
|
||||||
|
train_metrics = {metric: _metric_functions[metric](y.train, y_pred.train) for metric in metrics}
|
||||||
|
combined_metrics = {metric: _metric_functions[metric](y.combined, y_pred.combined) for metric in metrics}
|
||||||
|
all_metrics = {
|
||||||
|
"test_metrics": test_metrics,
|
||||||
|
"train_metrics": train_metrics,
|
||||||
|
"combined_metrics": combined_metrics,
|
||||||
|
}
|
||||||
|
test_metrics_file = results_dir / "metrics.toml"
|
||||||
print(f"Storing test metrics to {test_metrics_file}")
|
print(f"Storing test metrics to {test_metrics_file}")
|
||||||
with open(test_metrics_file, "w") as f:
|
with open(test_metrics_file, "w") as f:
|
||||||
toml.dump({"test_metrics": test_metrics}, f)
|
toml.dump(all_metrics, f)
|
||||||
|
|
||||||
# Store the confusion matrix
|
# Make confusion matrices for classification taasks
|
||||||
cm_file = results_dir / "confusion_matrix.nc"
|
if settings.task in ["binary", "count_regimes", "density_regimes"]:
|
||||||
print(f"Storing confusion matrix to {cm_file}")
|
labels = np.array(training_data.target_codes)
|
||||||
cm.to_netcdf(cm_file, engine="h5netcdf")
|
cm = xr.Dataset(
|
||||||
|
{
|
||||||
|
"test": (("true_label", "predicted_label"), confusion_matrix(y.test, y_pred.test, labels=labels)),
|
||||||
|
"train": (("true_label", "predicted_label"), confusion_matrix(y.train, y_pred.train, labels=labels)),
|
||||||
|
"combined": (
|
||||||
|
("true_label", "predicted_label"),
|
||||||
|
confusion_matrix(y.combined, y_pred.combined, labels=labels),
|
||||||
|
),
|
||||||
|
},
|
||||||
|
coords={"true_label": training_data.target_labels, "predicted_label": training_data.target_labels},
|
||||||
|
)
|
||||||
|
# Store the confusion matrices
|
||||||
|
cm_file = results_dir / "confusion_matrix.nc"
|
||||||
|
print(f"Storing confusion matrices to {cm_file}")
|
||||||
|
cm.to_netcdf(cm_file, engine="h5netcdf")
|
||||||
|
|
||||||
# Get the inner state of the best estimator
|
# Get the inner state of the best estimator
|
||||||
features = training_data.X.data.columns.tolist()
|
|
||||||
|
|
||||||
if settings.model == "espa":
|
if settings.model == "espa":
|
||||||
# Annotate the state with xarray metadata
|
state = extract_espa_feature_importance(best_estimator, training_data)
|
||||||
boxes = list(range(best_estimator.K_))
|
|
||||||
box_centers = xr.DataArray(
|
|
||||||
best_estimator.S_.cpu().numpy(),
|
|
||||||
dims=["feature", "box"],
|
|
||||||
coords={"feature": features, "box": boxes},
|
|
||||||
name="box_centers",
|
|
||||||
attrs={"description": "Centers of the boxes in feature space."},
|
|
||||||
)
|
|
||||||
box_assignments = xr.DataArray(
|
|
||||||
best_estimator.Lambda_.cpu().numpy(),
|
|
||||||
dims=["class", "box"],
|
|
||||||
coords={"class": training_data.y.labels, "box": boxes},
|
|
||||||
name="box_assignments",
|
|
||||||
attrs={"description": "Assignments of samples to boxes."},
|
|
||||||
)
|
|
||||||
feature_weights = xr.DataArray(
|
|
||||||
best_estimator.W_.cpu().numpy(),
|
|
||||||
dims=["feature"],
|
|
||||||
coords={"feature": features},
|
|
||||||
name="feature_weights",
|
|
||||||
attrs={"description": "Feature weights for each box."},
|
|
||||||
)
|
|
||||||
state = xr.Dataset(
|
|
||||||
{
|
|
||||||
"box_centers": box_centers,
|
|
||||||
"box_assignments": box_assignments,
|
|
||||||
"feature_weights": feature_weights,
|
|
||||||
},
|
|
||||||
attrs={
|
|
||||||
"description": "Inner state of the best ESPAClassifier from RandomizedSearchCV.",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
state_file = results_dir / "best_estimator_state.nc"
|
state_file = results_dir / "best_estimator_state.nc"
|
||||||
print(f"Storing best estimator state to {state_file}")
|
print(f"Storing best estimator state to {state_file}")
|
||||||
state.to_netcdf(state_file, engine="h5netcdf")
|
state.to_netcdf(state_file, engine="h5netcdf")
|
||||||
|
|
||||||
elif settings.model == "xgboost":
|
elif settings.model == "xgboost":
|
||||||
# Extract XGBoost-specific information
|
state = extract_xgboost_feature_importance(best_estimator, training_data)
|
||||||
# Get the underlying booster
|
|
||||||
booster = best_estimator.get_booster()
|
|
||||||
|
|
||||||
# Feature importance with different importance types
|
|
||||||
# Note: get_score() returns dict with keys like 'f0', 'f1', etc. (feature indices)
|
|
||||||
importance_weight = booster.get_score(importance_type="weight")
|
|
||||||
importance_gain = booster.get_score(importance_type="gain")
|
|
||||||
importance_cover = booster.get_score(importance_type="cover")
|
|
||||||
importance_total_gain = booster.get_score(importance_type="total_gain")
|
|
||||||
importance_total_cover = booster.get_score(importance_type="total_cover")
|
|
||||||
|
|
||||||
# Create aligned arrays for all features (including zero-importance)
|
|
||||||
def align_importance(importance_dict, features):
|
|
||||||
"""Align importance dict to feature list, filling missing with 0.
|
|
||||||
|
|
||||||
XGBoost returns feature indices (f0, f1, ...) as keys, so we need to map them.
|
|
||||||
"""
|
|
||||||
return [importance_dict.get(f"f{i}", 0.0) for i in range(len(features))]
|
|
||||||
|
|
||||||
feature_importance_weight = xr.DataArray(
|
|
||||||
align_importance(importance_weight, features),
|
|
||||||
dims=["feature"],
|
|
||||||
coords={"feature": features},
|
|
||||||
name="feature_importance_weight",
|
|
||||||
attrs={"description": "Number of times a feature is used to split the data across all trees."},
|
|
||||||
)
|
|
||||||
feature_importance_gain = xr.DataArray(
|
|
||||||
align_importance(importance_gain, features),
|
|
||||||
dims=["feature"],
|
|
||||||
coords={"feature": features},
|
|
||||||
name="feature_importance_gain",
|
|
||||||
attrs={"description": "Average gain across all splits the feature is used in."},
|
|
||||||
)
|
|
||||||
feature_importance_cover = xr.DataArray(
|
|
||||||
align_importance(importance_cover, features),
|
|
||||||
dims=["feature"],
|
|
||||||
coords={"feature": features},
|
|
||||||
name="feature_importance_cover",
|
|
||||||
attrs={"description": "Average coverage across all splits the feature is used in."},
|
|
||||||
)
|
|
||||||
feature_importance_total_gain = xr.DataArray(
|
|
||||||
align_importance(importance_total_gain, features),
|
|
||||||
dims=["feature"],
|
|
||||||
coords={"feature": features},
|
|
||||||
name="feature_importance_total_gain",
|
|
||||||
attrs={"description": "Total gain across all splits the feature is used in."},
|
|
||||||
)
|
|
||||||
feature_importance_total_cover = xr.DataArray(
|
|
||||||
align_importance(importance_total_cover, features),
|
|
||||||
dims=["feature"],
|
|
||||||
coords={"feature": features},
|
|
||||||
name="feature_importance_total_cover",
|
|
||||||
attrs={"description": "Total coverage across all splits the feature is used in."},
|
|
||||||
)
|
|
||||||
|
|
||||||
# Store tree information
|
|
||||||
n_trees = booster.num_boosted_rounds()
|
|
||||||
|
|
||||||
state = xr.Dataset(
|
|
||||||
{
|
|
||||||
"feature_importance_weight": feature_importance_weight,
|
|
||||||
"feature_importance_gain": feature_importance_gain,
|
|
||||||
"feature_importance_cover": feature_importance_cover,
|
|
||||||
"feature_importance_total_gain": feature_importance_total_gain,
|
|
||||||
"feature_importance_total_cover": feature_importance_total_cover,
|
|
||||||
},
|
|
||||||
attrs={
|
|
||||||
"description": "Inner state of the best XGBClassifier from RandomizedSearchCV.",
|
|
||||||
"n_trees": n_trees,
|
|
||||||
"objective": str(best_estimator.objective),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
state_file = results_dir / "best_estimator_state.nc"
|
state_file = results_dir / "best_estimator_state.nc"
|
||||||
print(f"Storing best estimator state to {state_file}")
|
print(f"Storing best estimator state to {state_file}")
|
||||||
state.to_netcdf(state_file, engine="h5netcdf")
|
state.to_netcdf(state_file, engine="h5netcdf")
|
||||||
|
|
||||||
elif settings.model == "rf":
|
elif settings.model == "rf":
|
||||||
# Extract Random Forest-specific information
|
state = extract_rf_feature_importance(best_estimator, training_data)
|
||||||
# Note: cuML's RandomForestClassifier doesn't expose individual trees (estimators_)
|
|
||||||
# like sklearn does, so we can only extract feature importances and model parameters
|
|
||||||
|
|
||||||
# Feature importances (Gini importance)
|
|
||||||
feature_importances = best_estimator.feature_importances_
|
|
||||||
|
|
||||||
feature_importance = xr.DataArray(
|
|
||||||
feature_importances,
|
|
||||||
dims=["feature"],
|
|
||||||
coords={"feature": features},
|
|
||||||
name="feature_importance",
|
|
||||||
attrs={"description": "Gini importance (impurity-based feature importance)."},
|
|
||||||
)
|
|
||||||
|
|
||||||
# cuML RF doesn't expose individual trees, so we store model parameters instead
|
|
||||||
n_estimators = best_estimator.n_estimators
|
|
||||||
max_depth = best_estimator.max_depth
|
|
||||||
|
|
||||||
# OOB score if available
|
|
||||||
oob_score = None
|
|
||||||
if hasattr(best_estimator, "oob_score_") and best_estimator.oob_score:
|
|
||||||
oob_score = float(best_estimator.oob_score_)
|
|
||||||
|
|
||||||
# cuML RandomForest doesn't provide per-tree statistics like sklearn
|
|
||||||
# Store what we have: feature importances and model configuration
|
|
||||||
attrs = {
|
|
||||||
"description": "Inner state of the best RandomForestClassifier from RandomizedSearchCV (cuML).",
|
|
||||||
"n_estimators": int(n_estimators),
|
|
||||||
"note": "cuML RandomForest does not expose individual tree statistics like sklearn",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Only add optional attributes if they have values
|
|
||||||
if max_depth is not None:
|
|
||||||
attrs["max_depth"] = int(max_depth)
|
|
||||||
if oob_score is not None:
|
|
||||||
attrs["oob_score"] = oob_score
|
|
||||||
|
|
||||||
state = xr.Dataset(
|
|
||||||
{
|
|
||||||
"feature_importance": feature_importance,
|
|
||||||
},
|
|
||||||
attrs=attrs,
|
|
||||||
)
|
|
||||||
state_file = results_dir / "best_estimator_state.nc"
|
state_file = results_dir / "best_estimator_state.nc"
|
||||||
print(f"Storing best estimator state to {state_file}")
|
print(f"Storing best estimator state to {state_file}")
|
||||||
state.to_netcdf(state_file, engine="h5netcdf")
|
state.to_netcdf(state_file, engine="h5netcdf")
|
||||||
|
|
||||||
elif settings.model == "knn":
|
|
||||||
# KNN doesn't have traditional feature importance
|
|
||||||
# Store information about the training data and neighbors
|
|
||||||
n_neighbors = best_estimator.n_neighbors
|
|
||||||
|
|
||||||
# We can't extract meaningful "state" from KNN in the same way
|
|
||||||
# but we can store metadata about the model
|
|
||||||
state = xr.Dataset(
|
|
||||||
attrs={
|
|
||||||
"description": "Metadata of the best KNeighborsClassifier from RandomizedSearchCV.",
|
|
||||||
"n_neighbors": n_neighbors,
|
|
||||||
"weights": str(best_estimator.weights),
|
|
||||||
"algorithm": str(best_estimator.algorithm),
|
|
||||||
"metric": str(best_estimator.metric),
|
|
||||||
"n_samples_fit": best_estimator.n_samples_fit_,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
state_file = results_dir / "best_estimator_state.nc"
|
|
||||||
print(f"Storing best estimator metadata to {state_file}")
|
|
||||||
state.to_netcdf(state_file, engine="h5netcdf")
|
|
||||||
|
|
||||||
# Predict probabilities for all cells
|
# Predict probabilities for all cells
|
||||||
print("Predicting probabilities for all cells...")
|
print("Predicting probabilities for all cells...")
|
||||||
preds = predict_proba(dataset_ensemble, clf=best_estimator, classes=training_data.y.labels)
|
preds = predict_proba(dataset_ensemble, model=best_estimator, device=device)
|
||||||
|
if training_data.targets["y"].dtype == "category":
|
||||||
|
preds["predicted"] = preds["predicted"].astype("category")
|
||||||
|
preds["predicted"].cat.categories = training_data.targets["y"].cat.categories
|
||||||
print(f"Predicted probabilities DataFrame with {len(preds)} entries.")
|
print(f"Predicted probabilities DataFrame with {len(preds)} entries.")
|
||||||
preds_file = results_dir / "predicted_probabilities.parquet"
|
preds_file = results_dir / "predicted_probabilities.parquet"
|
||||||
print(f"Storing predicted probabilities to {preds_file}")
|
print(f"Storing predicted probabilities to {preds_file}")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue