Source code for calibration.cross_eval

"""Cross-scenario evaluation -- run configs across multiple scenarios.

Evaluates parameter configurations on all specified scenarios simultaneously
and ranks using cross-scenario criteria.
"""

from __future__ import annotations

import argparse
import math
import statistics
from pathlib import Path
from typing import Any

from calibration.analysis import CalibrationResult, ScenarioResult
from calibration.io import OUTPUT_DIR, save_stability
from calibration.stability import _evaluate_single_seed


def rank_cross_scenario(
    results: list[CalibrationResult],
    strategy: str = "stability-first",
) -> list[CalibrationResult]:
    """Rank configs using cross-scenario criteria.

    ``results`` is sorted **in place** (best first) and the same list object
    is returned. Results that compare equal keep their relative input order.

    A result whose ``scenario_results`` is ``None`` or empty is treated as
    having a worst-case pass rate of 0, a worst-case combined score of 0 and
    no failures, instead of raising.

    Parameters
    ----------
    results : list[CalibrationResult]
        Results with ``scenario_results`` populated.
    strategy : str
        Ranking strategy:

        - "stability-first": min(pass_rates) -> total fails -> min(combined)
        - "score-first": min(combined) -> total fails
        - "balanced": geometric mean of combined scores

    Returns
    -------
    list[CalibrationResult]
        Sorted results (best first).

    Raises
    ------
    ValueError
        If strategy is not recognized. The list is left untouched in that
        case.
    """

    def scenario_values(result: CalibrationResult) -> list[ScenarioResult]:
        # ``scenario_results`` may be None, so normalise to an empty list.
        return list((result.scenario_results or {}).values())

    def stability_key(result: CalibrationResult) -> tuple[float, int, float]:
        srs = scenario_values(result)
        return (
            min((sr.pass_rate for sr in srs), default=0.0),
            # Negated because the sort is descending: fewer fails ranks higher.
            -sum(sr.n_fail for sr in srs),
            min((sr.combined_score for sr in srs), default=0.0),
        )

    def score_key(result: CalibrationResult) -> tuple[float, int]:
        srs = scenario_values(result)
        return (
            min((sr.combined_score for sr in srs), default=0.0),
            -sum(sr.n_fail for sr in srs),
        )

    def geomean(result: CalibrationResult) -> float:
        scores = [sr.combined_score for sr in scenario_values(result)]
        if not scores:
            return 0.0
        # Clamp to a tiny positive value so log() is defined for scores <= 0.
        return math.exp(sum(math.log(max(s, 1e-10)) for s in scores) / len(scores))

    sort_keys = {
        "stability-first": stability_key,
        "score-first": score_key,
        "balanced": geomean,
    }
    key = sort_keys.get(strategy) if isinstance(strategy, str) else None
    if key is None:
        raise ValueError(
            f"Unknown ranking strategy: '{strategy}'. "
            f"Available: stability-first, score-first, balanced"
        )

    results.sort(key=key, reverse=True)
    return results
def evaluate_cross_scenario(
    configs: list[dict[str, Any]],
    scenarios: list[str],
    n_seeds: int = 100,
    n_periods: int = 1000,
    n_workers: int = 10,
) -> list[CalibrationResult]:
    """Evaluate configs across multiple scenarios.

    For every (config, scenario) pair, seeds ``0 .. n_seeds - 1`` are run
    through ``_evaluate_single_seed`` and summarised as::

        combined = mean(scores) * pass_rate * (1 - stdev(scores))

    where ``pass_rate`` is the fraction of seeds reporting zero failures and
    the stored ``n_fail`` is the number of seeds reporting at least one
    failure. ``seed_scores`` is always in seed order, whether the seeds were
    run sequentially or in parallel.

    Parameters
    ----------
    configs : list[dict]
        Parameter configurations to evaluate.
    scenarios : list[str]
        Scenario names to evaluate on.
    n_seeds : int
        Seeds per scenario per config. Must be at least 1.
    n_periods : int
        Simulation periods.
    n_workers : int
        Parallel workers. Values of 1 or less run the seeds sequentially in
        the current process.

    Returns
    -------
    list[CalibrationResult]
        Results with scenario_results populated, one per config, in the
        order of ``configs``.

    Raises
    ------
    ValueError
        If ``n_seeds`` is less than 1 (no scores to summarise).
    """
    from concurrent.futures import ProcessPoolExecutor

    if n_seeds < 1:
        raise ValueError(f"n_seeds must be at least 1, got {n_seeds}")

    seeds = range(n_seeds)
    results: list[CalibrationResult] = []

    for cfg_idx, config in enumerate(configs):
        scenario_results: dict[str, ScenarioResult] = {}

        for scenario in scenarios:
            print(
                f" Config {cfg_idx + 1}/{len(configs)}, "
                f"scenario={scenario}, {n_seeds} seeds..."
            )

            if n_workers > 1:
                with ProcessPoolExecutor(max_workers=n_workers) as executor:
                    futures = [
                        executor.submit(
                            _evaluate_single_seed, config, scenario, seed, n_periods
                        )
                        for seed in seeds
                    ]
                    # Collect in submission order (not completion order) so
                    # seed_scores[i] belongs to seed i and repeated runs give
                    # identical output, matching the sequential path.
                    outcomes = [future.result() for future in futures]
            else:
                outcomes = [
                    _evaluate_single_seed(config, scenario, seed, n_periods)
                    for seed in seeds
                ]

            # Each outcome is a 4-tuple; only the last two fields are used.
            scores: list[float] = [score for _, _, score, _ in outcomes]
            fails: list[int] = [n_fail for _, _, _, n_fail in outcomes]

            mean = statistics.mean(scores)
            # stdev needs at least two samples.
            std = statistics.stdev(scores) if len(scores) > 1 else 0.0
            n_passed = sum(1 for nf in fails if nf == 0)
            pass_rate = n_passed / len(fails)
            combined = mean * pass_rate * (1.0 - std)
            # Number of seeds with at least one failure, not the sum of counts.
            total_fails = sum(1 for nf in fails if nf > 0)

            scenario_results[scenario] = ScenarioResult(
                mean_score=mean,
                std_score=std,
                combined_score=combined,
                pass_rate=pass_rate,
                n_fail=total_fails,
                seed_scores=scores,
            )

        results.append(
            CalibrationResult.from_cross_eval(
                params=config,
                scenario_results=scenario_results,
            )
        )

    return results
def _load_configs(path: Path) -> list[dict[str, Any]]: """Load configs from screening/stability JSON or YAML grid file. Auto-detects format: - JSON with ``results`` key: screening or stability result file - YAML: treated as a parameter grid (generates all combinations) """ if path.suffix in (".yml", ".yaml"): import yaml with open(path) as f: grid = yaml.safe_load(f) or {} from calibration.grid import generate_combinations return list(generate_combinations(grid)) import json with open(path) as f: data = json.load(f) return [r["params"] for r in data["results"]]
def compute_scenario_tension(
    results: list[CalibrationResult],
    scenarios: list[str],
) -> dict[str, dict[str, Any]]:
    """Analyze parameter tensions between scenarios.

    For each scenario the config with the highest ``combined_score`` on that
    scenario is selected (ties go to the earliest entry in ``results``; a
    result with no data for the scenario counts as score 0). A parameter is
    reported when those per-scenario winners do not all use the same value
    for it, indicating a trade-off between scenarios. A parameter missing
    from a winner's ``params`` is compared as ``None``.

    Values are compared with ``!=`` rather than hashed, so unhashable
    parameter values such as lists are supported.

    Parameters
    ----------
    results : list[CalibrationResult]
        Results with ``scenario_results`` populated.
    scenarios : list[str]
        Scenario names to compare.

    Returns
    -------
    dict[str, dict]
        Per-parameter tension info, keyed by parameter name in sorted order.
        Each entry has a single key ``"preferred_by"`` mapping scenario name
        to the value used by that scenario's best config. Empty when fewer
        than two results or fewer than two scenarios are given.
    """
    if len(results) < 2 or len(scenarios) < 2:
        return {}

    def scenario_score(result: CalibrationResult, scenario: str) -> float:
        sr = (result.scenario_results or {}).get(scenario)
        return 0 if sr is None else sr.combined_score

    # For each scenario, find the best config (by combined score for that scenario)
    best_per_scenario: dict[str, dict[str, Any]] = {}
    for scenario in scenarios:
        best = max(results, key=lambda r: scenario_score(r, scenario))
        best_per_scenario[scenario] = dict(best.params)

    all_params: set[str] = set()
    for params in best_per_scenario.values():
        all_params.update(params)

    # Identify params whose preferred value differs between scenarios
    tension: dict[str, dict[str, Any]] = {}
    for param in sorted(all_params):
        values = {s: best_per_scenario[s].get(param) for s in scenarios}
        preferred = list(values.values())
        if any(value != preferred[0] for value in preferred[1:]):
            tension[param] = {"preferred_by": values}

    return tension
def run_cross_eval_phase(args: argparse.Namespace, run_dir: Path | None = None) -> None:
    """CLI entry point for cross-eval phase.

    Loads the configs named by ``args.configs``, evaluates them on every
    scenario in the comma-separated ``args.scenarios``, ranks them, prints
    the top 10 plus a scenario tension analysis, and writes
    ``cross_eval_results.json`` into ``run_dir`` (or ``OUTPUT_DIR`` when
    ``run_dir`` is not given).

    Parameters
    ----------
    args : argparse.Namespace
        Parsed CLI arguments. Reads ``scenarios``, ``configs``,
        ``sensitivity_seeds``, ``periods``, ``workers`` and ``rank_by``.
    run_dir : Path or None
        Output directory; created if missing.

    Raises
    ------
    SystemExit
        If ``--scenarios`` is missing or contains only blank names, or if
        ``--configs`` is missing.
    """
    if not args.scenarios:
        raise SystemExit("--scenarios is required for cross-eval phase")
    if not args.configs:
        raise SystemExit("--configs is required for cross-eval phase")

    # Drop blank entries so input like "a,b," or "a,,b" does not schedule an
    # evaluation for a scenario named "".
    scenarios = [
        name for name in (s.strip() for s in args.scenarios.split(",")) if name
    ]
    if not scenarios:
        raise SystemExit("--scenarios is required for cross-eval phase")

    configs = _load_configs(Path(args.configs))

    print(f"[cross-eval] {len(configs)} configs x {len(scenarios)} scenarios")
    print(f"[cross-eval] Scenarios: {scenarios}")

    results = evaluate_cross_scenario(
        configs=configs,
        scenarios=scenarios,
        n_seeds=args.sensitivity_seeds,
        n_periods=args.periods,
        n_workers=args.workers,
    )

    # Default to stability-first for cross-eval; any value outside the three
    # cross-scenario strategies falls back to it rather than raising.
    rank_by = (
        args.rank_by
        if args.rank_by in ("stability-first", "score-first", "balanced")
        else "stability-first"
    )
    results = rank_cross_scenario(results, rank_by)

    # Print summary
    print(f"\n Cross-Scenario Ranking ({rank_by}):")
    for i, r in enumerate(results[:10]):
        srs = r.scenario_results or {}
        min_pr = min((sr.pass_rate for sr in srs.values()), default=0)
        total_f = sum(sr.n_fail for sr in srs.values())
        min_c = min((sr.combined_score for sr in srs.values()), default=0)
        print(
            f" #{i + 1}: min_pass={min_pr:.0%} total_fails={total_f} "
            f"min_combined={min_c:.4f}"
        )

    # Scenario tension analysis
    tension = compute_scenario_tension(results, scenarios)
    if tension:
        print("\n Scenario Tension Analysis:")
        for param, info in tension.items():
            prefs = ", ".join(f"{s}={v}" for s, v in info["preferred_by"].items())
            print(f" {param}: {prefs}")

    # Save
    out = run_dir or OUTPUT_DIR
    out.mkdir(parents=True, exist_ok=True)
    save_stability(results, "cross_eval", out / "cross_eval_results.json")
    print(f"\nCross-eval results saved to {out}")