"""Citation domain types and utilities."""
import re
from dataclasses import dataclass
from datetime import UTC, datetime
from string import ascii_lowercase
from typing import Literal, cast
from uuid import uuid4
from veupath_chatbot.platform.types import JSONObject, JSONValue
# Values accepted for ``Citation.source``.
CitationSource = Literal[
    "web",
    "europepmc",
    "crossref",
    "openalex",
    "semanticscholar",
    "pubmed",
    "arxiv",
    "biorxiv",
    "medrxiv",
]

# Every ``CitationSource`` value except "web", plus the extra value "all".
# NOTE(review): "all" presumably means "query every literature source" --
# confirm against the code that consumes this alias.
LiteratureSource = Literal[
    "europepmc",
    "crossref",
    "openalex",
    "semanticscholar",
    "pubmed",
    "arxiv",
    "biorxiv",
    "medrxiv",
    "all",
]

# The two recognised sort orders.
LiteratureSort = Literal["relevance", "newest"]
[docs]
@dataclass(frozen=True)
class Citation:
    """Immutable record describing one cited source.

    ``id``, ``source`` and ``title`` are required; every other field
    defaults to ``None``.
    """

    id: str
    source: CitationSource
    title: str
    url: str | None = None
    authors: list[str] | None = None
    year: int | None = None
    doi: str | None = None
    pmid: str | None = None
    snippet: str | None = None
    accessed_at: str | None = None

    def to_dict(self) -> JSONObject:
        """Return the citation as a JSON-style mapping.

        Besides the stored fields, the mapping carries a ``tag`` entry that
        is computed on each call by ``_suggest_citation_tag``. The
        ``accessed_at`` field is emitted under the camelCase key
        ``accessedAt``; all other keys match the field names.
        """
        suggested_tag = _suggest_citation_tag(
            source=self.source,
            title=self.title,
            authors=self.authors,
            year=self.year,
            doi=self.doi,
            pmid=self.pmid,
            url=self.url,
        )
        payload: JSONObject = {
            "id": self.id,
            "source": self.source,
            "tag": suggested_tag,
            "title": self.title,
            "url": self.url,
            # The cast only informs the type checker; the list is passed as-is.
            "authors": cast(JSONValue, self.authors),
            "year": self.year,
            "doi": self.doi,
            "pmid": self.pmid,
            "snippet": self.snippet,
            "accessedAt": self.accessed_at,
        }
        return payload
def _now_iso() -> str:
return datetime.now(UTC).isoformat()
def _new_citation_id(prefix: str) -> str:
return f"{prefix}_{uuid4().hex[:12]}"
def _slug_token(value: str | None, *, max_len: int = 32) -> str:
if not isinstance(value, str):
return ""
t = value.strip().lower()
if not t:
return ""
t = re.sub(r"[^a-z0-9]+", "", t)
return t[:max_len]
def _suggest_citation_tag(
    *,
    source: CitationSource,
    title: str,
    authors: list[str] | None,
    year: int | None,
    doi: str | None,
    pmid: str | None,
    url: str | None,
) -> str:
    """Suggest a short tag for a citation.

    Candidates are tried in order and the first non-empty one is returned:

    1. The slug of the first word of the first author's name (taken from
       the part before the first comma), followed by ``year`` when it is
       truthy.
    2. The slug of the title's first word followed by ``year`` (only when
       ``year`` is truthy).
    3. The slug of the whole title, limited to 20 characters.
    4. The slug of the first truthy value among ``doi``, ``pmid`` and
       ``url``, limited to 20 characters.
    5. ``str(source)``.
    """
    lead_author = authors[0] if authors else None
    if isinstance(lead_author, str) and lead_author:
        name_words = lead_author.split(",")[0].split()
    else:
        name_words = []

    author_key = _slug_token(name_words[0]) if name_words else ""
    if author_key:
        return f"{author_key}{year}" if year else author_key

    # The first-word form is only used together with a year.
    title_words = title.split() if isinstance(title, str) else []
    if title_words and year:
        lead_word = _slug_token(title_words[0])
        if lead_word:
            return f"{lead_word}{year}"

    return (
        _slug_token(title, max_len=20)
        or _slug_token(doi or pmid or url, max_len=20)
        or str(source)
    )
# Private utilities exported for use by the services module.
# These are implementation details, but they need to be accessible to it.