"""PubMed API client."""
import re
from typing import cast
import httpx
from veupath_chatbot.platform.types import JSONObject, JSONValue
from veupath_chatbot.services.research.clients._base import (
API_USER_AGENT,
BaseClient,
build_response,
make_citation,
)
from veupath_chatbot.services.research.utils import strip_tags, truncate_text
[docs]
class PubmedClient(BaseClient):
    """Client for the PubMed API (NCBI E-utilities).

    PubMed requires a multi-step fetch (esearch -> esummary -> optional
    efetch for abstracts), so it keeps a custom ``search`` method.
    Per-item parsing still goes through ``_parse_item`` / ``_build_results``.
    """

    # Set by ``search`` immediately before ``_build_results`` so that
    # ``_parse_item`` knows whether abstracts were requested for this call.
    _include_abstract: bool = False

    # Patterns used to pull abstracts out of the efetch XML payload.  Each
    # record is isolated first so a lazy match can never run from one
    # article into the next.  ``\b`` keeps ``<PubmedArticle`` from matching
    # ``<PubmedArticleSet`` and ``<Abstract`` from matching ``<AbstractText``.
    _ARTICLE_RE = re.compile(
        r"<(PubmedArticle|PubmedBookArticle)\b[^>]*>(.*?)</\1\s*>",
        re.IGNORECASE | re.DOTALL,
    )
    # Tolerates attributes on the tag, e.g. ``<PMID Version="1">``.
    _PMID_RE = re.compile(r"<PMID\b[^>]*>\s*(\d+)\s*</PMID\s*>", re.IGNORECASE)
    _ABSTRACT_RE = re.compile(
        r"<Abstract\b[^>]*>(.*?)</Abstract\s*>",
        re.IGNORECASE | re.DOTALL,
    )
    # ``(?<!/)`` skips self-closing ``<AbstractText ... />`` elements.
    _ABSTRACT_TEXT_RE = re.compile(
        r"<AbstractText\b[^>]*(?<!/)>(.*?)</AbstractText\s*>",
        re.IGNORECASE | re.DOTALL,
    )

    async def search(
        self,
        query: str,
        *,
        limit: int,
        include_abstract: bool,
        abstract_max_chars: int,
    ) -> JSONObject:
        """Search PubMed.

        Args:
            query: Search term sent to esearch as ``term``.
            limit: Maximum number of PMIDs requested (esearch ``retmax``).
            include_abstract: When true, abstracts are fetched via efetch and
                included (truncated) in the results.
            abstract_max_chars: Length limit passed to ``truncate_text`` for
                each abstract.

        Returns:
            The object produced by ``build_response`` for source ``"pubmed"``;
            ``results`` and ``citations`` are empty when nothing was fetched.

        Raises:
            httpx.HTTPStatusError: If any E-utilities request returns an
                error status (raised by ``raise_for_status`` in
                ``_fetch_raw``).
        """
        raw_items = await self._fetch_raw(
            query,
            limit=limit,
            include_abstract=include_abstract,
        )
        if not raw_items:
            return build_response(
                query=query, source="pubmed", results=[], citations=[]
            )

        # Assigned after the await and consumed by the call right below, with
        # no await in between, so the flag read by ``_parse_item`` is the one
        # belonging to this call.
        self._include_abstract = include_abstract
        results, citations = self._build_results(
            raw_items, abstract_max_chars=abstract_max_chars
        )
        return build_response(
            query=query, source="pubmed", results=results, citations=citations
        )

    # -- fetch -------------------------------------------------------------

    async def _fetch_raw(
        self, query: str, *, limit: int, include_abstract: bool
    ) -> list[JSONValue]:
        """esearch + esummary (+ optional efetch) -> list of per-PMID dicts.

        Each returned item is a dict with the keys ``_pmid`` (str), ``_meta``
        (the esummary entry for that PMID) and ``_abstract`` (str or None).
        PMIDs without a dict entry in the esummary result are skipped.

        Raises:
            httpx.HTTPStatusError: If any of the requests returns an error
                status.
        """
        async with httpx.AsyncClient(
            timeout=self._timeout, headers={"User-Agent": API_USER_AGENT}
        ) as client:
            esearch = await client.get(
                "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
                params={
                    "db": "pubmed",
                    "term": query,
                    "retmax": str(limit),
                    "retmode": "json",
                },
            )
            esearch.raise_for_status()
            search_payload = esearch.json()

            # Validate every level of the payload instead of chaining
            # ``.get`` on values whose type is not guaranteed.
            esearch_result = (
                search_payload.get("esearchresult")
                if isinstance(search_payload, dict)
                else None
            )
            idlist = (
                esearch_result.get("idlist")
                if isinstance(esearch_result, dict)
                else None
            )
            if not isinstance(idlist, list):
                return []
            pmids = [str(x) for x in idlist if str(x).strip()]
            if not pmids:
                return []

            id_param = ",".join(pmids)
            esummary = await client.get(
                "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",
                params={"db": "pubmed", "id": id_param, "retmode": "json"},
            )
            esummary.raise_for_status()
            sum_payload = esummary.json()
            sum_result = (
                sum_payload.get("result") if isinstance(sum_payload, dict) else None
            )

            abstracts_by_pmid: dict[str, str] = {}
            if include_abstract:
                efetch = await client.get(
                    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
                    params={"db": "pubmed", "id": id_param, "retmode": "xml"},
                )
                efetch.raise_for_status()
                abstracts_by_pmid = self._extract_abstracts(efetch.text or "", pmids)

        if not isinstance(sum_result, dict):
            return []

        items: list[JSONValue] = []
        for pmid in pmids:
            meta = sum_result.get(pmid)
            if not isinstance(meta, dict):
                continue
            items.append(
                {
                    "_pmid": pmid,
                    "_meta": meta,
                    "_abstract": abstracts_by_pmid.get(pmid),
                }
            )
        return items

    @classmethod
    def _extract_abstracts(cls, xml: str, pmids: list[str]) -> dict[str, str]:
        """Map each requested PMID to the abstract text found in efetch XML.

        The XML is split into ``<PubmedArticle>`` / ``<PubmedBookArticle>``
        records.  For each record the first ``<PMID>`` is taken as the
        record's own identifier, and every ``<AbstractText>`` section inside
        its ``<Abstract>`` element is tag-stripped and joined with a single
        space.  Records without an ``<Abstract>`` (or without any
        ``<AbstractText>``) contribute no entry, so an abstract can never be
        attributed to a neighbouring article.

        Args:
            xml: Raw efetch response body.
            pmids: The PMIDs that were requested; other records are ignored.

        Returns:
            Dict from PMID to abstract text.  When a PMID occurs in several
            records the first one wins.
        """
        wanted = set(pmids)
        found: dict[str, str] = {}
        for record in cls._ARTICLE_RE.finditer(xml):
            record_body = record.group(2)
            pmid_match = cls._PMID_RE.search(record_body)
            if pmid_match is None:
                continue
            pmid = pmid_match.group(1)
            if pmid not in wanted or pmid in found:
                continue
            abstract_match = cls._ABSTRACT_RE.search(record_body)
            if abstract_match is None:
                continue
            sections = cls._ABSTRACT_TEXT_RE.findall(abstract_match.group(1))
            if not sections:
                continue
            found[pmid] = " ".join(strip_tags(section) for section in sections)
        return found

    # -- parse -------------------------------------------------------------

    def _parse_item(
        self, raw: JSONValue, *, abstract_max_chars: int
    ) -> tuple[JSONObject, JSONObject] | None:
        """Turn one item from ``_fetch_raw`` into a (result, citation) pair.

        Args:
            raw: Dict with ``_pmid``, ``_meta`` and ``_abstract`` keys.
            abstract_max_chars: Length limit passed to ``truncate_text``.

        Returns:
            ``(result, citation)``, or ``None`` when ``raw`` is not a dict,
            ``_pmid`` is not a string, or ``_meta`` is not a dict.
        """
        if not isinstance(raw, dict):
            return None
        pmid = raw.get("_pmid")
        meta = raw.get("_meta")
        if not isinstance(pmid, str) or not isinstance(meta, dict):
            return None

        title = str(meta.get("title") or "").strip()

        # ``pubdate`` is free text; the first run of four digits is the year.
        pubdate = str(meta.get("pubdate") or "")
        m_year = re.search(r"(\d{4})", pubdate)
        year: int | None = int(m_year.group(1)) if m_year else None

        authors: list[str] | None = None
        raw_authors = meta.get("authors")
        if isinstance(raw_authors, list):
            authors = [
                str(a.get("name"))
                for a in raw_authors
                if isinstance(a, dict) and a.get("name")
            ]

        journal = meta.get("fulljournalname")
        journal = str(journal).strip() if journal else None

        abstract_text = raw.get("_abstract")
        abstract: str | None = abstract_text if isinstance(abstract_text, str) else None
        # NOTE(review): when abstracts were requested but none was found this
        # passes ``None`` to ``truncate_text`` -- confirm it accepts ``None``.
        abstract = (
            truncate_text(abstract, abstract_max_chars)
            if self._include_abstract
            else None
        )

        url_item = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
        # The snippet falls back to the journal name when there is no abstract.
        snippet = abstract or journal
        result: JSONObject = {
            "title": title,
            "year": year,
            "pmid": pmid,
            "url": url_item,
            "authors": cast(JSONValue, authors),
            "journalTitle": journal,
            "abstract": abstract,
            "snippet": snippet,
        }
        citation = make_citation(
            source="pubmed",
            id_prefix="pubmed",
            title=title or url_item,
            url=url_item,
            authors=authors,
            year=year,
            pmid=pmid,
            snippet=snippet,
        )
        return result, citation