Source code for veupath_chatbot.domain.research.citations

"""Citation domain types and utilities."""

import re
from dataclasses import dataclass
from datetime import UTC, datetime
from string import ascii_lowercase
from typing import Literal, cast
from uuid import uuid4

from veupath_chatbot.platform.types import JSONObject, JSONValue

CitationSource = Literal[
    "web",
    "europepmc",
    "crossref",
    "openalex",
    "semanticscholar",
    "pubmed",
    "arxiv",
    "biorxiv",
    "medrxiv",
]
LiteratureSource = Literal[
    "europepmc",
    "crossref",
    "openalex",
    "semanticscholar",
    "pubmed",
    "arxiv",
    "biorxiv",
    "medrxiv",
    "all",
]
LiteratureSort = Literal["relevance", "newest"]



[docs]
@dataclass(frozen=True)
class Citation:
    id: str
    source: CitationSource
    title: str
    url: str | None = None
    authors: list[str] | None = None
    year: int | None = None
    doi: str | None = None
    pmid: str | None = None
    snippet: str | None = None
    accessed_at: str | None = None


[docs]
    def to_dict(self) -> JSONObject:
        tag = _suggest_citation_tag(
            source=self.source,
            title=self.title,
            authors=self.authors,
            year=self.year,
            doi=self.doi,
            pmid=self.pmid,
            url=self.url,
        )
        return {
            "id": self.id,
            "source": self.source,
            "tag": tag,
            "title": self.title,
            "url": self.url,
            "authors": cast(JSONValue, self.authors),
            "year": self.year,
            "doi": self.doi,
            "pmid": self.pmid,
            "snippet": self.snippet,
            "accessedAt": self.accessed_at,
        }




def _now_iso() -> str:
    return datetime.now(UTC).isoformat()


def _new_citation_id(prefix: str) -> str:
    return f"{prefix}_{uuid4().hex[:12]}"


def _slug_token(value: str | None, *, max_len: int = 32) -> str:
    if not isinstance(value, str):
        return ""
    t = value.strip().lower()
    if not t:
        return ""
    t = re.sub(r"[^a-z0-9]+", "", t)
    return t[:max_len]


def _suggest_citation_tag(
    *,
    source: CitationSource,
    title: str,
    authors: list[str] | None,
    year: int | None,
    doi: str | None,
    pmid: str | None,
    url: str | None,
) -> str:
    first_author: str | None = None
    if authors and len(authors) > 0:
        first_author = authors[0] if isinstance(authors[0], str) else None
    parts = str(first_author).split(",")[0].split() if first_author else []
    first_last = _slug_token(parts[0]) if parts else ""
    if first_last and year:
        return f"{first_last}{year}"
    if first_last:
        return first_last

    first_word = (
        _slug_token(title.split()[0])
        if isinstance(title, str) and title.split()
        else ""
    )
    if first_word and year:
        return f"{first_word}{year}"

    title_slug = _slug_token(title, max_len=20)
    if title_slug:
        return title_slug

    stable = _slug_token(doi or pmid or url, max_len=20)
    return stable or str(source)



[docs]
def ensure_unique_citation_tags(citations: list[JSONObject]) -> None:
    """Ensure all citation tags are unique by appending suffixes if needed.

    :param citations: Citation objects.

    """
    used: dict[str, int] = {}
    for c in citations:
        if not isinstance(c, dict):
            continue
        base = _slug_token(str(c.get("tag") or ""), max_len=40) or "ref"
        n = used.get(base, 0)
        if n == 0:
            tag = base
        else:
            if n <= len(ascii_lowercase):
                tag = f"{base}{ascii_lowercase[n - 1]}"
            else:
                tag = f"{base}_{n + 1}"
        used[base] = n + 1
        c["tag"] = tag



# Private utilities exported for use by services module
# These are implementation details but need to be accessible