Source code for veupath_chatbot.services.research.clients.arxiv

"""arXiv API client."""

import re

import httpx

from veupath_chatbot.platform.types import JSONObject, JSONValue
from veupath_chatbot.services.research.clients._base import (
    API_USER_AGENT,
    StandardClient,
    make_citation,
)
from veupath_chatbot.services.research.utils import strip_tags, truncate_text


# Precompiled patterns for the parts of the Atom feed this client reads.
# Opening tags tolerate attributes (e.g. ``<title type="html">``), which Atom
# permits even though the feed is usually emitted without them.
_ENTRY_RE = re.compile(r"<entry\b[^>]*>(.*?)</entry>", re.IGNORECASE | re.DOTALL)
_TITLE_RE = re.compile(r"<title\b[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL)
_SUMMARY_RE = re.compile(
    r"<summary\b[^>]*>(.*?)</summary>", re.IGNORECASE | re.DOTALL
)
_LINK_TAG_RE = re.compile(r"<link\b[^>]*>", re.IGNORECASE)
_HREF_RE = re.compile(r'\bhref="([^"]+)"', re.IGNORECASE)
_REL_ALTERNATE_RE = re.compile(r'\brel="alternate"', re.IGNORECASE)


def _entry_url(entry_xml: str) -> str | None:
    """Return the best landing-page URL found in one ``<entry>`` fragment.

    An entry may carry several ``<link>`` elements (abstract page, PDF, DOI
    resolver).  The ``rel="alternate"`` link is preferred because it is the
    entry's canonical HTML page; if none is marked that way, the first link
    that has an ``href`` is used.  Returns ``None`` when no link has an
    ``href``.
    """
    first_href: str | None = None
    for tag in _LINK_TAG_RE.findall(entry_xml):
        href_match = _HREF_RE.search(tag)
        if href_match is None:
            continue
        if _REL_ALTERNATE_RE.search(tag):
            return href_match.group(1)
        if first_href is None:
            first_href = href_match.group(1)
    return first_href


class ArxivClient(StandardClient):
    """Client for arXiv API.

    Queries the arXiv export endpoint, which answers with an Atom XML feed,
    and converts each ``<entry>`` into a ``(result, citation)`` pair.  The
    feed is scanned with regular expressions rather than an XML parser.
    """

    _source_name = "arxiv"

    async def _fetch_raw(self, query: str, *, limit: int) -> list[JSONValue]:
        """Run an all-fields search and return the raw entry fragments.

        Args:
            query: Search terms; sent as ``all:<query>``.
            limit: Maximum number of entries to request and to return.

        Returns:
            A list of ``{"_xml": <entry inner XML>}`` dicts, one per
            ``<entry>`` element, at most ``limit`` long.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        url = "http://export.arxiv.org/api/query"
        params = {
            "search_query": f"all:{query}",
            "start": "0",
            "max_results": str(limit),
        }
        async with httpx.AsyncClient(
            timeout=self._timeout, headers={"User-Agent": API_USER_AGENT}
        ) as client:
            resp = await client.get(url, params=params, follow_redirects=True)
            resp.raise_for_status()
            xml = resp.text or ""
        entries = _ENTRY_RE.findall(xml)
        # The server is already asked for ``limit`` results; slice anyway so
        # the caller never receives more than it requested.
        return [{"_xml": e} for e in entries[:limit]]

    def _parse_item(
        self, raw: JSONValue, *, abstract_max_chars: int
    ) -> tuple[JSONObject, JSONObject] | None:
        """Convert one raw entry from ``_fetch_raw`` into result and citation.

        Args:
            raw: A ``{"_xml": str}`` dict as produced by ``_fetch_raw``.
            abstract_max_chars: Length budget handed to ``truncate_text`` for
                the ``abstract`` field.

        Returns:
            ``(result, citation)``, or ``None`` when ``raw`` is not a dict or
            carries no string under ``"_xml"``.
        """
        if not isinstance(raw, dict):
            return None
        e = raw.get("_xml")
        if not isinstance(e, str):
            return None

        title = strip_tags("".join(_TITLE_RE.findall(e))).strip()
        # arXiv hard-wraps long titles (newline plus indentation); collapse
        # whitespace runs so titles render on a single line.
        title = " ".join(title.split())

        url_item = _entry_url(e)

        abstract = strip_tags("".join(_SUMMARY_RE.findall(e))).strip()

        result: JSONObject = {
            "title": title,
            "url": url_item,
            "abstract": truncate_text(abstract, abstract_max_chars) or "",
            # The snippet keeps the full, untruncated abstract text.
            "snippet": abstract,
        }
        citation = make_citation(
            source="arxiv",
            id_prefix="arxiv",
            # Fall back to the URL, then a generic label, so the citation
            # always has a non-empty title.
            title=title or (url_item or "arXiv result"),
            url=url_item,
            snippet=abstract,
        )
        return result, citation