Source code for veupath_chatbot.services.research.clients.pubmed

"""PubMed API client."""

import re
from typing import cast

import httpx

from veupath_chatbot.platform.types import JSONObject, JSONValue
from veupath_chatbot.services.research.clients._base import (
    API_USER_AGENT,
    BaseClient,
    build_response,
    make_citation,
)
from veupath_chatbot.services.research.utils import strip_tags, truncate_text


class PubmedClient(BaseClient):
    """Client for the PubMed API (NCBI E-utilities).

    PubMed requires a multi-step fetch (esearch -> esummary -> optional
    efetch for abstracts), so it keeps a custom ``search`` method.
    Per-item parsing still goes through ``_parse_item`` / ``_build_results``.
    """

    # Written by ``search`` right before it calls ``_build_results`` and read
    # by ``_parse_item`` to decide whether an abstract may be emitted.
    _include_abstract: bool = False

    async def search(
        self,
        query: str,
        *,
        limit: int,
        include_abstract: bool,
        abstract_max_chars: int,
    ) -> JSONObject:
        """Search PubMed.

        Args:
            query: Search term, sent to esearch as ``term``.
            limit: Maximum number of PMIDs requested (esearch ``retmax``).
            include_abstract: When true, abstracts are fetched via efetch and
                included (truncated) in the results.
            abstract_max_chars: Length limit handed to ``truncate_text``.

        Returns:
            The object produced by ``build_response`` for source ``"pubmed"``;
            its results and citations are empty when nothing was found.

        Raises:
            httpx.HTTPStatusError: If any E-utilities request returns an
                error status (propagated from ``raise_for_status``).
        """
        raw_items = await self._fetch_raw(
            query,
            limit=limit,
            include_abstract=include_abstract,
        )
        if not raw_items:
            return build_response(
                query=query, source="pubmed", results=[], citations=[]
            )

        # No ``await`` happens between this assignment and ``_build_results``,
        # so the flag always describes the request being parsed.
        self._include_abstract = include_abstract
        results, citations = self._build_results(
            raw_items, abstract_max_chars=abstract_max_chars
        )
        return build_response(
            query=query, source="pubmed", results=results, citations=citations
        )

    # -- fetch -------------------------------------------------------------

    async def _fetch_raw(
        self, query: str, *, limit: int, include_abstract: bool
    ) -> list[JSONValue]:
        """esearch + esummary (+ optional efetch) -> list of per-PMID dicts.

        Each returned dict has the keys ``_pmid`` (str), ``_meta`` (the
        esummary record for that PMID) and ``_abstract`` (str or ``None``).
        PMIDs for which esummary returned no dict record are skipped.
        """
        eutils = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
        async with httpx.AsyncClient(
            timeout=self._timeout, headers={"User-Agent": API_USER_AGENT}
        ) as client:
            esearch = await client.get(
                f"{eutils}/esearch.fcgi",
                params={
                    "db": "pubmed",
                    "term": query,
                    "retmax": str(limit),
                    "retmode": "json",
                },
            )
            esearch.raise_for_status()
            search_payload = esearch.json()

            # Validate every level of the payload instead of chaining ``.get``
            # so an unexpected shape yields "no results" rather than an
            # AttributeError (or a string being iterated char by char).
            search_result = (
                search_payload.get("esearchresult")
                if isinstance(search_payload, dict)
                else None
            )
            idlist = (
                search_result.get("idlist")
                if isinstance(search_result, dict)
                else None
            )
            if not isinstance(idlist, list):
                idlist = []
            pmids = [pmid for pmid in (str(x).strip() for x in idlist) if pmid]
            if not pmids:
                return []

            id_param = ",".join(pmids)
            esummary = await client.get(
                f"{eutils}/esummary.fcgi",
                params={"db": "pubmed", "id": id_param, "retmode": "json"},
            )
            esummary.raise_for_status()
            sum_payload = esummary.json()
            sum_result = (
                sum_payload.get("result") if isinstance(sum_payload, dict) else {}
            )

            abstracts_by_pmid: dict[str, str] = {}
            if include_abstract:
                efetch = await client.get(
                    f"{eutils}/efetch.fcgi",
                    params={"db": "pubmed", "id": id_param, "retmode": "xml"},
                )
                efetch.raise_for_status()
                abstracts_by_pmid = self._extract_abstracts(efetch.text or "")

        items: list[JSONValue] = []
        for pmid in pmids:
            meta = sum_result.get(pmid) if isinstance(sum_result, dict) else None
            if not isinstance(meta, dict):
                continue
            items.append(
                {
                    "_pmid": pmid,
                    "_meta": meta,
                    "_abstract": abstracts_by_pmid.get(pmid),
                }
            )
        return items

    @staticmethod
    def _extract_abstracts(xml: str) -> dict[str, str]:
        """Map PMID -> abstract text for each record of an efetch XML payload.

        Matching is confined to one article record at a time so a record
        without an abstract can never pick up the abstract of a later record.
        All ``<AbstractText>`` sections of a record are joined with a single
        space (structured abstracts have several). Records lacking a PMID or
        a non-empty abstract are omitted from the mapping.
        """
        flags = re.IGNORECASE | re.DOTALL
        abstracts: dict[str, str] = {}
        # ``\b`` keeps the pattern from matching the <PubmedArticleSet> root.
        for record in re.finditer(
            r"<(Pubmed(?:Book)?Article)\b[^>]*>(.*?)</\1>", xml, flags=flags
        ):
            body = record.group(2)
            # The record's own id is its first PMID element; the tag normally
            # carries attributes (``<PMID Version="1">``), so allow them.
            pmid_match = re.search(
                r"<PMID\b[^>]*>\s*(\d+)\s*</PMID>", body, flags=flags
            )
            # ``<Abstract`` cannot match ``<OtherAbstract`` or ``<AbstractText``.
            abstract_match = re.search(
                r"<Abstract\b[^>]*>(.*?)</Abstract>", body, flags=flags
            )
            if pmid_match is None or abstract_match is None:
                continue
            # ``(?<!/)`` skips self-closing ``<AbstractText/>`` elements.
            sections = (
                strip_tags(section).strip()
                for section in re.findall(
                    r"<AbstractText\b[^>]*(?<!/)>(.*?)</AbstractText>",
                    abstract_match.group(1),
                    flags=flags,
                )
            )
            text = " ".join(section for section in sections if section)
            if text:
                abstracts.setdefault(pmid_match.group(1), text)
        return abstracts

    # -- parse -------------------------------------------------------------

    def _parse_item(
        self, raw: JSONValue, *, abstract_max_chars: int
    ) -> tuple[JSONObject, JSONObject] | None:
        """Turn one ``_fetch_raw`` item into a ``(result, citation)`` pair.

        Args:
            raw: Dict with ``_pmid``, ``_meta`` and ``_abstract`` keys.
            abstract_max_chars: Length limit handed to ``truncate_text``.

        Returns:
            ``(result, citation)``, or ``None`` when ``raw`` is not a dict or
            lacks a string ``_pmid`` / dict ``_meta``.
        """
        if not isinstance(raw, dict):
            return None
        pmid = raw.get("_pmid")
        meta = raw.get("_meta")
        if not isinstance(pmid, str) or not isinstance(meta, dict):
            return None

        title = str(meta.get("title") or "").strip()

        # ``pubdate`` is free text; take the first 4-digit run as the year.
        pubdate = str(meta.get("pubdate") or "")
        m_year = re.search(r"(\d{4})", pubdate)
        year: int | None = int(m_year.group(1)) if m_year else None

        authors: list[str] | None = None
        raw_authors = meta.get("authors")
        if isinstance(raw_authors, list):
            authors = [
                str(a.get("name"))
                for a in raw_authors
                if isinstance(a, dict) and a.get("name")
            ]

        journal = meta.get("fulljournalname")
        journal = str(journal).strip() if journal else None

        # Only emit an abstract when it was requested AND one was found;
        # ``truncate_text`` is never handed ``None``.
        abstract_text = raw.get("_abstract")
        abstract: str | None = None
        if self._include_abstract and isinstance(abstract_text, str):
            abstract = truncate_text(abstract_text, abstract_max_chars)

        url_item = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
        result: JSONObject = {
            "title": title,
            "year": year,
            "pmid": pmid,
            "url": url_item,
            "authors": cast(JSONValue, authors),
            "journalTitle": journal,
            "abstract": abstract,
            "snippet": abstract or journal,
        }
        citation = make_citation(
            source="pubmed",
            id_prefix="pubmed",
            title=title or url_item,
            url=url_item,
            authors=authors,
            year=year,
            pmid=pmid,
            snippet=abstract or journal,
        )
        return result, citation