Source code for veupath_chatbot.services.research.clients.arxiv
"""arXiv API client."""
import re
import httpx
from veupath_chatbot.platform.types import JSONObject, JSONValue
from veupath_chatbot.services.research.clients._base import (
API_USER_AGENT,
StandardClient,
make_citation,
)
from veupath_chatbot.services.research.utils import strip_tags, truncate_text
[docs]
class ArxivClient(StandardClient):
    """Client for the arXiv Atom query API.

    ``_fetch_raw`` downloads the feed and splits it into per-entry XML
    fragments; ``_parse_item`` converts one fragment into a
    ``(result, citation)`` pair.
    """

    _source_name = "arxiv"

    async def _fetch_raw(self, query: str, *, limit: int) -> list[JSONValue]:
        """Query arXiv and return one ``{"_xml": ...}`` dict per ``<entry>``.

        Args:
            query: Search terms; sent to arXiv with the ``all:`` field prefix.
            limit: Maximum number of entries to request and to return.

        Returns:
            A list of dicts whose ``"_xml"`` value is the inner XML of a
            single ``<entry>`` element, in feed order.

        Raises:
            httpx.HTTPStatusError: If the response has an error status.
        """
        url = "http://export.arxiv.org/api/query"
        params = {
            "search_query": f"all:{query}",
            "start": "0",
            "max_results": str(limit),
        }
        async with httpx.AsyncClient(
            timeout=self._timeout, headers={"User-Agent": API_USER_AGENT}
        ) as client:
            resp = await client.get(url, params=params, follow_redirects=True)
            resp.raise_for_status()
            xml = resp.text or ""
        entries = re.findall(
            r"<entry>(.*?)</entry>", xml, flags=re.IGNORECASE | re.DOTALL
        )
        # Slice defensively in case the feed holds more entries than requested.
        return [{"_xml": e} for e in entries[:limit]]

    @staticmethod
    def _tag_text(entry_xml: str, tag: str) -> str:
        """Return the tag-stripped, trimmed text of every ``<tag>`` element."""
        chunks = re.findall(
            rf"<{tag}>(.*?)</{tag}>", entry_xml, flags=re.IGNORECASE | re.DOTALL
        )
        return strip_tags("".join(chunks)).strip()

    @staticmethod
    def _entry_url(entry_xml: str) -> str | None:
        """Pick the URL that should represent an entry.

        An entry may carry several ``<link>`` elements (abstract page, PDF,
        DOI) and the first one in document order is not guaranteed to be the
        abstract page, so the ``rel="alternate"`` link is preferred. If no such
        link exists, the first ``href`` found on any ``<link>`` is used.
        """
        for tag_m in re.finditer(r"<link\b[^>]*>", entry_xml, flags=re.IGNORECASE):
            tag = tag_m.group(0)
            if not re.search(
                r"\brel\s*=\s*[\"']alternate[\"']", tag, flags=re.IGNORECASE
            ):
                continue
            href_m = re.search(
                r"\bhref\s*=\s*[\"']([^\"']+)[\"']", tag, flags=re.IGNORECASE
            )
            if href_m:
                return href_m.group(1)
        # Fallback: first link with an href, whatever its relation.
        link_m = re.search(
            r'<link[^>]+href="([^"]+)"', entry_xml, flags=re.IGNORECASE
        )
        return link_m.group(1) if link_m else None

    def _parse_item(
        self, raw: JSONValue, *, abstract_max_chars: int
    ) -> tuple[JSONObject, JSONObject] | None:
        """Convert one raw entry produced by ``_fetch_raw`` into a result.

        Args:
            raw: Expected to be a dict with a string ``"_xml"`` value.
            abstract_max_chars: Length limit passed to ``truncate_text`` for
                the ``"abstract"`` field.

        Returns:
            ``(result, citation)``, or ``None`` when ``raw`` is not a dict or
            has no string ``"_xml"`` value. ``result["snippet"]`` and the
            citation snippet carry the untruncated abstract.
        """
        if not isinstance(raw, dict):
            return None
        entry_xml = raw.get("_xml")
        if not isinstance(entry_xml, str):
            return None

        title = self._tag_text(entry_xml, "title")
        url_item = self._entry_url(entry_xml)
        abstract = self._tag_text(entry_xml, "summary")

        result: JSONObject = {
            "title": title,
            "url": url_item,
            "abstract": truncate_text(abstract, abstract_max_chars) or "",
            "snippet": abstract,
        }
        citation = make_citation(
            source="arxiv",
            id_prefix="arxiv",
            # Fall back to the URL, then a fixed label, when no title exists.
            title=title or (url_item or "arXiv result"),
            url=url_item,
            snippet=abstract,
        )
        return result, citation