# Source code for veupath_chatbot.ai.models.catalog

"""Central model catalog — single source of truth for available LLM models.

Each entry carries enough metadata for the frontend to render a grouped
dropdown and for the backend to validate per-request overrides.

Cloud models are hardcoded.  Ollama (local) models are loaded from an
optional ``ollama_models.yaml`` file in the repository root.
"""

from dataclasses import dataclass
from functools import lru_cache

from veupath_chatbot.platform.types import ModelProvider, ReasoningEffort

__all__ = [
    "ModelEntry",
    "ModelProvider",
    "ReasoningEffort",
    "build_reasoning_hyperparams",
    "get_model_catalog",
    "get_model_entry",
]

def _openai_effort(level: str) -> dict[str, object]:
    """Build the flat OpenAI ``reasoning_effort`` param for *level*."""
    return {"reasoning_effort": level}


def _anthropic_thinking(budget_tokens: int) -> dict[str, object]:
    """Build the Anthropic ``thinking`` param with the given token budget."""
    return {"thinking": {"type": "enabled", "budget_tokens": budget_tokens}}


def _google_thinking(thinking_budget: int) -> dict[str, object]:
    """Build the Gemini ``thinking_config`` param with the given budget."""
    return {"thinking_config": {"thinking_budget": thinking_budget}}


def _no_effort_params() -> dict[ReasoningEffort, dict[str, object]]:
    """Build an effort map in which every level contributes no params."""
    return {"none": {}, "low": {}, "medium": {}, "high": {}}


# OpenAI reasoning models (gpt-5*, o1, o3, o4): the effort level is sent as
# the flat ``reasoning_effort`` param of ``chat.completions.create()``.
# "medium" sends nothing so the server-side default applies.
_OPENAI_EFFORT_MAP: dict[ReasoningEffort, dict[str, object]] = {
    "none": _openai_effort("none"),
    "low": _openai_effort("low"),
    "medium": {},
    "high": _openai_effort("high"),
}

# Anthropic extended thinking: a ``thinking`` param carrying a token budget.
# "none" sends nothing, i.e. the ``thinking`` param is omitted entirely.
_ANTHROPIC_EFFORT_MAP: dict[ReasoningEffort, dict[str, object]] = {
    "none": {},
    "low": _anthropic_thinking(1024),
    "medium": _anthropic_thinking(8192),
    "high": _anthropic_thinking(32768),
}

# Google Gemini 2.5: ``thinking_config`` is passed directly to
# ``GenerateContentConfig`` (not nested under ``generation_config``).
# A budget of 0 disables thinking; -1 = automatic (server decides).
_GOOGLE_EFFORT_MAP: dict[ReasoningEffort, dict[str, object]] = {
    "none": _google_thinking(0),
    "low": _google_thinking(1024),
    "medium": _google_thinking(8192),
    "high": _google_thinking(24576),
}

# Provider -> (effort level -> hyperparams) lookup table.
_EFFORT_MAPS: dict[ModelProvider, dict[ReasoningEffort, dict[str, object]]] = {
    "openai": _OPENAI_EFFORT_MAP,
    "anthropic": _ANTHROPIC_EFFORT_MAP,
    "google": _GOOGLE_EFFORT_MAP,
    # Ollama models generally don't support reasoning effort params.
    "ollama": _no_effort_params(),
    # Mock engine ignores reasoning effort.
    "mock": _no_effort_params(),
}



# [docs]  (Sphinx viewcode link marker left over from HTML extraction)
def build_reasoning_hyperparams(
    provider: ModelProvider,
    effort: ReasoningEffort | None,
    *,
    budget_override: int | None = None,
) -> dict[str, object]:
    """Return provider-specific hyperparams that implement *effort*.

    The returned dict (including any nested dicts) is a fresh copy, so
    callers may mutate or merge it freely without affecting the shared
    module-level effort maps or later calls.

    :param provider: Model provider.
    :param effort: Reasoning effort.  ``None`` yields an empty dict and
        ``budget_override`` is then ignored.
    :param budget_override: Custom reasoning token budget.  When positive it
        replaces the budget taken from the effort map for the providers that
        use a token budget (``anthropic`` and ``google``), for every
        non-``None`` effort level including ``"none"``.  It is ignored for
        all other providers and when ``None`` or not positive.
    :returns: Dict of provider-specific hyperparameters, or empty dict when
        the provider or effort level has no entry in the effort maps.
    """
    from copy import deepcopy

    if effort is None:
        return {}
    effort_map = _EFFORT_MAPS.get(provider, {})
    # Deep copy: the map values hold nested dicts (``thinking`` /
    # ``thinking_config``).  A shallow copy would hand callers the very
    # objects stored in the module-level tables.
    params: dict[str, object] = deepcopy(effort_map.get(effort, {}))
    if budget_override is not None and budget_override > 0:
        if provider == "anthropic":
            params["thinking"] = {"type": "enabled", "budget_tokens": budget_override}
        elif provider == "google":
            params["thinking_config"] = {"thinking_budget": budget_override}
    return params




# [docs]  (Sphinx viewcode link marker left over from HTML extraction)
@dataclass(frozen=True, slots=True)
class ModelEntry:
    """A single model in the catalog."""

    id: str  # e.g. "openai/gpt-5"
    name: str  # human-readable display name
    provider: ModelProvider
    model: str  # provider-native model ID (e.g. "gpt-5")
    description: str = ""
    supports_reasoning: bool = False
    context_size: int = 0  # known context window; 0 = use engine default
    default_reasoning_budget: int = (
        0  # default reasoning token budget at "medium" effort
    )
    input_price: float = 0.0  # USD per 1M input tokens
    cached_input_price: float = 0.0  # USD per 1M cached input tokens
    output_price: float = 0.0  # USD per 1M output tokens



# Cloud models — always present.
#
# Prices are standard (non-batch) list prices in USD per 1M tokens; re-check
# them against the provider pricing pages whenever an entry is added or
# updated.  Within one model family the cached-input rate is a fixed fraction
# of the input rate (25% for GPT-4.1, 10% for GPT-5), which is a quick sanity
# check for new rows.
_CLOUD_MODELS: tuple[ModelEntry, ...] = (
    # OpenAI
    ModelEntry(
        id="openai/gpt-4.1",
        name="GPT-4.1",
        provider="openai",
        model="gpt-4.1",
        description="Default workhorse — 1M context",
        context_size=1_047_576,
        input_price=2.00,
        cached_input_price=0.50,
        output_price=8.00,
    ),
    ModelEntry(
        id="openai/gpt-4.1-mini",
        name="GPT-4.1 Mini",
        provider="openai",
        model="gpt-4.1-mini",
        description="Fast and cheap with full context",
        context_size=1_047_576,
        input_price=0.40,
        cached_input_price=0.10,
        output_price=1.60,
    ),
    ModelEntry(
        id="openai/gpt-4.1-nano",
        name="GPT-4.1 Nano",
        provider="openai",
        model="gpt-4.1-nano",
        description="Ultra-cheap for simple tasks",
        context_size=1_047_576,
        input_price=0.10,
        cached_input_price=0.025,
        output_price=0.40,
    ),
    ModelEntry(
        id="openai/gpt-5",
        name="GPT-5",
        provider="openai",
        model="gpt-5",
        description="Smartest OpenAI model",
        supports_reasoning=True,
        context_size=400_000,
        input_price=1.25,
        cached_input_price=0.125,
        output_price=10.00,
    ),
    ModelEntry(
        id="openai/gpt-5-mini",
        name="GPT-5 Mini",
        provider="openai",
        model="gpt-5-mini",
        description="Smart and budget-friendly",
        supports_reasoning=True,
        context_size=400_000,
        input_price=0.25,
        cached_input_price=0.025,
        output_price=2.00,
    ),
    ModelEntry(
        id="openai/gpt-5-nano",
        name="GPT-5 Nano",
        provider="openai",
        model="gpt-5-nano",
        description="Ultra-cheap, smaller context",
        supports_reasoning=True,
        context_size=400_000,
        input_price=0.05,
        cached_input_price=0.005,
        output_price=0.40,
    ),
    ModelEntry(
        id="openai/gpt-5.4",
        name="GPT-5.4",
        provider="openai",
        model="gpt-5.4",
        description="Latest flagship — 1.1M context",
        supports_reasoning=True,
        context_size=1_100_000,
        input_price=2.50,
        cached_input_price=0.25,
        output_price=15.00,
    ),
    ModelEntry(
        id="openai/o3",
        name="o3",
        provider="openai",
        model="o3",
        description="Reasoning-focused",
        supports_reasoning=True,
        context_size=200_000,
        input_price=2.00,
        cached_input_price=0.50,
        output_price=8.00,
    ),
    ModelEntry(
        id="openai/o4-mini",
        name="o4 Mini",
        provider="openai",
        model="o4-mini",
        description="Cheap reasoning",
        supports_reasoning=True,
        context_size=200_000,
        input_price=1.10,
        cached_input_price=0.275,
        output_price=4.40,
    ),
    # Anthropic
    ModelEntry(
        id="anthropic/claude-opus-4-6",
        name="Claude Opus 4.6",
        provider="anthropic",
        model="claude-opus-4-6",
        description="Most capable Anthropic model",
        supports_reasoning=True,
        context_size=1_000_000,
        default_reasoning_budget=8192,
        input_price=5.00,
        cached_input_price=0.50,
        output_price=25.00,
    ),
    ModelEntry(
        id="anthropic/claude-sonnet-4-6",
        name="Claude Sonnet 4.6",
        provider="anthropic",
        model="claude-sonnet-4-6",
        description="Balanced speed and intelligence",
        supports_reasoning=True,
        context_size=1_000_000,
        default_reasoning_budget=8192,
        input_price=3.00,
        cached_input_price=0.30,
        output_price=15.00,
    ),
    ModelEntry(
        id="anthropic/claude-haiku-4-5",
        name="Claude Haiku 4.5",
        provider="anthropic",
        # The catalog ID is the short alias; the provider-native ID is dated.
        model="claude-haiku-4-5-20251001",
        description="Fastest Anthropic model",
        supports_reasoning=True,
        context_size=200_000,
        default_reasoning_budget=8192,
        input_price=1.00,
        cached_input_price=0.10,
        output_price=5.00,
    ),
    # Google
    ModelEntry(
        id="google/gemini-2.5-pro",
        name="Gemini 2.5 Pro",
        provider="google",
        model="gemini-2.5-pro",
        description="Best Google — deep reasoning",
        supports_reasoning=True,
        context_size=1_048_576,
        default_reasoning_budget=8192,
        input_price=1.25,
        cached_input_price=0.125,
        output_price=10.00,
    ),
    ModelEntry(
        id="google/gemini-3.1-pro",
        name="Gemini 3.1 Pro",
        provider="google",
        # The catalog ID omits the "-preview" suffix of the native model ID.
        model="gemini-3.1-pro-preview",
        description="Latest Google flagship",
        supports_reasoning=True,
        context_size=1_000_000,
        default_reasoning_budget=8192,
        input_price=2.00,
        cached_input_price=0.20,
        output_price=12.00,
    ),
    # Mock (deterministic E2E testing) — no prices: it makes no LLM calls.
    ModelEntry(
        id="mock/deterministic",
        name="Mock (deterministic)",
        provider="mock",
        model="deterministic",
        description="Deterministic mock for E2E testing — no LLM calls",
        context_size=128_000,
    ),
)


def _ollama_entries_from_config(data: object) -> tuple[ModelEntry, ...]:
    """Convert parsed ``ollama_models.yaml`` content into catalog entries.

    Anything that does not have the expected shape is skipped rather than
    raised on: a non-mapping top level, a missing/null/non-list ``models``
    key, non-mapping items, items without a non-empty string ``model``, and
    repeated model names (the first occurrence wins).
    """
    if not isinstance(data, dict):
        return ()
    raw_models = data.get("models")
    if not isinstance(raw_models, list):
        return ()

    entries: list[ModelEntry] = []
    seen: set[str] = set()
    for item in raw_models:
        if not isinstance(item, dict):
            continue
        model_name = item.get("model")
        if not isinstance(model_name, str) or not model_name or model_name in seen:
            continue
        seen.add(model_name)

        # ``name: null`` (or a missing name) falls back to the model ID.
        display = item.get("name") or model_name

        # 0 means "use the engine default"; unusable values fall back to it.
        raw_context = item.get("context_size")
        try:
            context_size = max(int(raw_context), 0) if raw_context else 0
        except (TypeError, ValueError):
            context_size = 0

        entries.append(
            ModelEntry(
                id=f"ollama/{model_name}",
                name=f"{display} (local)",
                provider="ollama",
                model=model_name,
                supports_reasoning=bool(item.get("thinking", False)),
                context_size=context_size,
            )
        )
    return tuple(entries)


def _load_ollama_models() -> tuple[ModelEntry, ...]:
    """Load Ollama model entries from the YAML config file.

    The file is ``ollama_models.yaml`` in the repository root.  It is
    optional: when it is absent, empty, or lists no usable models, an empty
    tuple is returned.

    YAML format (``ollama_models.yaml``)::

        models:
          - model: llama3
            name: Llama 3
          - model: mistral
            name: Mistral 7B
          - model: qwen3
            name: Qwen 3
            thinking: true        # optional, default false
            context_size: 32768   # optional, default 0 (engine default)

    :returns: One entry per distinct ``model`` value, in file order, with
        the ID ``ollama/<model>`` and the display name ``<name> (local)``.
    :raises yaml.YAMLError: If the file exists but is not valid YAML.
    """
    # Imported lazily, as in the rest of this function: the project config
    # and PyYAML are only needed when the catalog is actually built.
    from veupath_chatbot.platform.config import _REPO_ROOT

    path = _REPO_ROOT / "ollama_models.yaml"
    if not path.is_file():
        return ()

    import yaml

    # Explicit encoding so parsing does not depend on the platform locale.
    with path.open(encoding="utf-8") as f:
        data = yaml.safe_load(f)

    return _ollama_entries_from_config(data)



# [docs]  (Sphinx viewcode link marker left over from HTML extraction)
@lru_cache
def get_model_catalog() -> tuple[ModelEntry, ...]:
    """Return every catalog entry: cloud models first, then local ones.

    The result is memoised by ``lru_cache``, so the local model config is
    loaded on the first call only (until the cache is cleared).
    """
    cloud_models = _CLOUD_MODELS
    local_models = _load_ollama_models()
    return cloud_models + local_models



def _build_index() -> dict[str, ModelEntry]:
    """Build a fresh ``id -> entry`` mapping over the full model catalog.

    Should two entries share an ID, the one later in the catalog wins.
    """
    index: dict[str, ModelEntry] = {}
    for entry in get_model_catalog():
        index[entry.id] = entry
    return index



# [docs]  (Sphinx viewcode link marker left over from HTML extraction)
def get_model_entry(model_id: str) -> ModelEntry | None:
    """Find the catalog entry registered under *model_id*.

    :param model_id: Model identifier (e.g. ``openai/gpt-5``).
    :returns: The matching entry, or ``None`` when the ID is unknown.
    """
    index = _build_index()
    if model_id in index:
        return index[model_id]
    return None